def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=0.003,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often I choose a latent
             truncate_local_is_ratio=None,
             epsilon=0.1,
             train_pi_iters=10,
             use_skill_dependent_baseline=False,
             mlp_skill_dependent_baseline=False,
             freeze_manager=False,
             freeze_skills=False,
             **kwargs):
    if optimizer is None:
        if optimizer_args is None:
            # optimizer_args = dict()
            optimizer_args = dict(batch_size=None)
        self.optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                             max_epochs=train_pi_iters,
                                             **optimizer_args)
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    self.epsilon = epsilon

    super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period
    self.freeze_manager = freeze_manager
    self.freeze_skills = freeze_skills
    assert (not freeze_manager) or (not freeze_skills)

    # todo: fix this sampler stuff
    # import pdb; pdb.set_trace()
    self.sampler = HierBatchSampler(self, self.period)
    # self.sampler = BatchSampler(self)

    # i hope this is right
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
    assert self.policy.period == self.period
    # self.old_policy = copy.deepcopy(self.policy)

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
        skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
        skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
        if self.mlp_skill_dependent_baseline:
            self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
        else:
            self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
def __init__(self, wrapped_env):
    self.__wrapped_env = wrapped_env
    observation_space = wrapped_env.observation_space
    action_space = wrapped_env.action_space
    self.observation_space = convert_gym_space(observation_space)
    self.action_space = convert_gym_space(action_space)
    self.spec = EnvSpec(self.observation_space, self.action_space)
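# Usage sketch for the wrapper above. The class name GymEnvWrapper is an
# assumption (it is not shown in the snippet); convert_gym_space and EnvSpec
# are the rllab helpers this constructor relies on.
#
# import gym
# env = GymEnvWrapper(gym.make('Pendulum-v0'))
# env.spec.observation_space and env.spec.action_space are now rllab-style
# spaces, so env.spec can be handed directly to rllab policies and baselines.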
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=1e-2,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often I choose a latent
             truncate_local_is_ratio=None,
             use_skill_dependent_baseline=False,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(learning_rate=step_size, **optimizer_args)
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio

    super(PG_concurrent_approx, self).__init__(**kwargs)  # not sure if this line is correct
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period

    # todo: fix this sampler stuff
    self.sampler = HierBatchSampler(self, self.period)

    # i hope this is right
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
    assert self.policy.period == self.period
    self.trainable_manager = self.policy.trainable_manager

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        skill_dependent_obs_space_dim = ((curr_env.observation_space.shape[0] + 1) * self.num_latents,)
        skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
        self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
def spec(self):
    """
    Returns an EnvSpec.

    Returns:
        spec (garage.envs.EnvSpec)
    """
    return EnvSpec(observation_space=self.observation_space,
                   action_space=self.action_space)
def __init__(self, env, base_policy, num_skills, steps_per_option=100):
    Serializable.quick_init(self, locals())
    self._base_policy = base_policy
    self._env = env
    self._steps_per_option = steps_per_option
    self._num_skills = num_skills
    self.observation_space = self._env.observation_space
    self.action_space = spaces.Discrete(num_skills)
    self.spec = EnvSpec(self.observation_space, self.action_space)
    self._obs = self.reset()
def __init__(self, env, num_skills, z):
    Serializable.quick_init(self, locals())
    self._env = env
    self._num_skills = num_skills
    self._z = z
    obs_space = self._env.observation_space
    low = np.hstack([obs_space.low, np.full(num_skills, 0)])
    high = np.hstack([obs_space.high, np.full(num_skills, 1)])
    self.observation_space = spaces.Box(low=low, high=high)
    self.action_space = self._env.action_space
    self.spec = EnvSpec(self.observation_space, self.action_space)
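# The wrapper above fixes a skill index z and widens the observation space by
# num_skills one-hot dimensions. A minimal sketch of how its step()/reset()
# would concatenate the indicator (the helper below is assumed, not part of
# the original class):
import numpy as np


def _augment_obs(obs, z, num_skills):
    """Append a one-hot skill indicator to a raw observation vector."""
    one_hot = np.zeros(num_skills)
    one_hot[z] = 1.0
    return np.hstack([obs, one_hot])

# e.g. an 8-dim observation with 6 skills becomes a 14-dim vector, matching
# the Box(low, high) bounds built in __init__ above.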
def __init__(self, env):
    Serializable.quick_init(self, locals())
    ProxyEnv.__init__(self, env)
    self.action_space = convert_space_to_tf_space(
        self._wrapped_env.action_space)
    self.observation_space = convert_space_to_tf_space(
        self._wrapped_env.observation_space)

    from rllab.envs.env_spec import EnvSpec
    self.spec = EnvSpec(
        observation_space=self.observation_space,
        action_space=self.action_space,
    )
def env_create(self):
    route = '/v1/envs/'
    data = {'env_id': 'lol'}
    resp = self._post_request(route, data)
    instance_id = resp['instance_id']
    self.instance_id = instance_id
    asi = self.env_action_space_info(instance_id)
    obi = self.env_observation_space_info(instance_id)
    self.action_space = Box(np.array(asi['low']), np.array(asi['high']))
    self.observation_space = Box(np.array(obi['low']), np.array(obi['high']))
    # EnvSpec expects (observation_space, action_space)
    self.spec = EnvSpec(self.observation_space, self.action_space)
    self.spec.id = 0
    self.spec.timestep_limit = 500
    self.instance_id = instance_id
def goal_env_spec(self, dict_keys=('observation', 'desired_goal')):
    """
    Convert selected keys of a dict observation space to a Box space and
    return the corresponding env_spec.

    :param dict_keys: (`tuple`) desired keys that you would like to use.
    :return: (`EnvSpec`) converted object
    """
    assert isinstance(self.observation_space, Dict)
    obs_dim = np.sum([self.observation_space.spaces[key].flat_dim for key in dict_keys])
    obs_space = Box(-np.inf, np.inf, shape=(obs_dim,))
    return EnvSpec(
        observation_space=obs_space,
        action_space=self.action_space,
    )
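# Usage sketch for goal_env_spec(), assuming `env` is an instance of the
# wrapping class and its Dict observation space follows the gym GoalEnv
# convention with 'observation', 'desired_goal' and 'achieved_goal' keys
# (GaussianMLPPolicy is the rllab policy used elsewhere in this file):
#
# spec = env.goal_env_spec(dict_keys=('observation', 'desired_goal'))
# policy = GaussianMLPPolicy(env_spec=spec, hidden_sizes=(64, 64))
#
# The returned spec flattens only the selected keys, so the policy never
# sees 'achieved_goal'.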
def __init__(self, env):
    super().__init__(env)
    Serializable.quick_init(self, locals())
    self.keys = ["robot-state", "object-state"]

    # set up observation and action spaces
    flat_ob = self._flatten_obs(self.env.reset(), verbose=True)
    self.obs_dim = flat_ob.size
    high = np.inf * np.ones(self.obs_dim)
    low = -high
    self.observation_space = Box(low=low, high=high)
    low, high = self.env.action_spec
    self.action_space = Box(low=low, high=high)

    self.spec = EnvSpec(
        observation_space=self.observation_space,
        action_space=self.action_space,
    )
def __init__(self,
             regressor,
             inputSize,
             outputSize,
             left_min,
             right_min,
             left_max,
             right_max,
             state_representation='all',
             n_candidates=1000,
             horizon=10,
             horiz_penalty_factor=30,
             backward_discouragement=10,
             heading_penalty_factor=5,
             visualize_rviz=True,
             x_index=0,
             y_index=1,
             yaw_cos_index=10,
             yaw_sin_index=11,
             test_regressor=False,
             frequency_value=10,
             serial_port=None,
             baud_rate=None):
    self.regressor = regressor
    self.visualize_rviz = visualize_rviz
    self.state_representation = state_representation  # from config['policy']
    self.n_candidates = n_candidates
    self.horizon = horizon
    self.horiz_penalty_factor = horiz_penalty_factor
    self.backward_discouragement = backward_discouragement
    self.heading_penalty_factor = heading_penalty_factor

    # from config['roach']
    self.x_index = x_index
    self.y_index = y_index
    self.yaw_cos_index = yaw_cos_index
    self.yaw_sin_index = yaw_sin_index

    # useless
    self.random = False
    self.multi_input = False
    self.is_onehot = False

    # initialized by the gbac_controller_node
    self.desired_states = None
    self.publish_markers_desired = None
    self.publish_markers = None

    '''if test_regressor:
        self.plot = PlotRegressor('RoachEnv')
    #NOTE: setting this to True doesnt give you an optimal rollout
    #it plans only evey horizon-steps, and doesnt replan at each step
    #because it's trying to verify accuracy of regressor'''

    Serializable.quick_init(self, locals())

    # define action space for roach
    my_obs = Box(low=-1 * np.ones(outputSize,), high=np.ones(outputSize,))
    my_ac = Box(low=np.array([left_min, right_min]),
                high=np.array([left_max, right_max]))
    super(NaiveMPCController, self).__init__(
        env_spec=EnvSpec(observation_space=my_obs, action_space=my_ac))
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=SimpleSampler(
            max_path_length=variant["max_path_length"],
            min_pool_size=variant["max_path_length"],
            batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )
    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
        reparametrize=variant["reparametrize"])

    algorithm.train()
def __init__(self,
             setup,
             host=None,
             dof=6,
             control_type='position',
             derivative_type='none',
             target_type='position',
             reset_type='random',
             reward_type='linear',
             deriv_action_max=10,
             first_deriv_max=10,  # used only with second derivative control
             vel_penalty=0,
             obs_history=1,
             actuation_sync_period=1,
             episode_length_time=None,
             episode_length_step=None,
             rllab_box=False,
             servoj_t=ur_utils.COMMANDS['SERVOJ']['default']['t'],
             servoj_gain=ur_utils.COMMANDS['SERVOJ']['default']['gain'],
             speedj_a=ur_utils.COMMANDS['SPEEDJ']['default']['a'],
             speedj_t_min=ur_utils.COMMANDS['SPEEDJ']['default']['t_min'],
             movej_t=2,  # used for resetting
             accel_max=None,
             speed_max=None,
             dt=0.008,
             delay=0.0,  # to simulate extra delay in the system
             **kwargs):
    """Inits ReacherEnv class with task and robot specific parameters.

    Args:
        setup: a dictionary containing UR5 reacher task specifications,
            such as safety box dimensions, joint angle ranges, boundary on
            the arm speed, UR5 Controller IP address etc
            (see senseact.devices.ur.ur_setups for examples).
        host: a string specifying UR5 IP address or None
        dof: an integer number of degrees of freedom, either 2 for Reacher2D
            or 6 for Reacher6D
        control_type: a string specifying UR5 control type, either position
            (using UR5 servoJ commands) or velocity (using UR5 speedJ commands)
        derivative_type: a string specifying what type of derivative control
            to use, either "none", "first" or "second"
        target_type: a string specifying in what space to provide target
            coordinates, either "position" for Cartesian space or "angle"
            for joints angles space.
        reset_type: a string specifying whether to reset the arm to a fixed
            position or to a random position.
        reward_type: a string specifying the reward function, either
            "linear" for - d_t, or "precision" for - d_t + exp^( - d_t^2)
        deriv_action_max: a float specifying maximum value of an action
            for derivative control
        first_deriv_max: a float specifying maximum value of a first
            derivative of action if derivative_type == "second"
        vel_penalty: a float specifying the weight of a velocity penalty
            term in the reward function.
        obs_history: an integer number of sensory packets concatenated into
            a single observation vector
        actuation_sync_period: a bool specifying whether to synchronize
            sending actuation commands to UR5 with receiving sensory packets
            from UR5 (must be true for smooth UR5 operation).
        episode_length_time: a float duration of an episode defined in seconds
        episode_length_step: an integer duration of an episode defined in
            environment steps.
        rllab_box: a bool specifying whether to wrap environment action and
            observation spaces into an RllabBox object (required for
            off-the-shelf rllab algorithms implementations).
        servoj_t: a float specifying time parameter of a UR5 servoj command.
        servoj_gain: a float specifying gain parameter of a UR5 servoj command.
        speedj_a: a float specifying acceleration parameter of a UR5 speedj command.
        speedj_t_min: a float specifying t_min parameter of a UR5 speedj command.
        movej_t: a float specifying time parameter of a UR5 movej command.
        accel_max: a float specifying maximum allowed acceleration of UR5 arm
            joints. If None, a value from setup is used.
        speed_max: a float specifying maximum allowed speed of UR5 joints.
            If None, a value from setup is used.
        dt: a float specifying duration of an environment time step in seconds.
        delay: a float specifying artificial observation delay in seconds
    """
    # Check that the task parameters chosen are implemented in this class
    assert dof in [2, 6]
    assert control_type in ['position', 'velocity', 'acceleration']
    assert derivative_type in ['none', 'first', 'second']
    assert target_type in ['position', 'angle']
    assert reset_type in ['random', 'zero', 'none']
    assert actuation_sync_period >= 0

    if episode_length_step is not None:
        assert episode_length_time is None
        self._episode_length_step = episode_length_step
        self._episode_length_time = episode_length_step * dt
    elif episode_length_time is not None:
        assert episode_length_step is None
        self._episode_length_time = episode_length_time
        self._episode_length_step = int(episode_length_time / dt)
    else:
        # TODO: should we allow a continuous behaviour case here, with no episodes?
        print("episode_length_time or episode_length_step needs to be set")
        raise AssertionError

    # Task Parameters
    self._host = setups[setup]['host'] if host is None else host
    self._obs_history = obs_history
    self._dof = dof
    self._control_type = control_type
    self._derivative_type = derivative_type
    self._target_type = target_type
    self._reset_type = reset_type
    self._reward_type = reward_type
    self._vel_penalty = vel_penalty  # weight of the velocity penalty
    self._deriv_action_max = deriv_action_max
    self._first_deriv_max = first_deriv_max
    self._delay = delay
    if accel_max is None:
        accel_max = setups[setup]['accel_max']
    if speed_max is None:
        speed_max = setups[setup]['speed_max']
    if self._dof == 6:
        self._joint_indices = [0, 1, 2, 3, 4, 5]
        self._end_effector_indices = [0, 1, 2]
    elif self._dof == 2:
        self._joint_indices = [1, 2]
        self._end_effector_indices = [1, 2]

    # Arm/Control/Safety Parameters
    self._end_effector_low = setups[setup]['end_effector_low']
    self._end_effector_high = setups[setup]['end_effector_high']
    self._angles_low = setups[setup]['angles_low'][self._joint_indices]
    self._angles_high = setups[setup]['angles_high'][self._joint_indices]
    self._speed_low = -np.ones(self._dof) * speed_max
    self._speed_high = np.ones(self._dof) * speed_max
    self._accel_low = -np.ones(self._dof) * accel_max
    self._accel_high = np.ones(self._dof) * accel_max

    self._box_bound_buffer = setups[setup]['box_bound_buffer']
    self._angle_bound_buffer = setups[setup]['angle_bound_buffer']
    self._q_ref = setups[setup]['q_ref']
    self._ik_params = setups[setup]['ik_params']

    # State Variables
    self._q_ = np.zeros((self._obs_history, self._dof))
    self._qd_ = np.zeros((self._obs_history, self._dof))

    self._episode_steps = 0

    self._pstop_time_ = None
    self._pstop_times_ = []
    self._safety_mode_ = ur_utils.SafetyModes.NONE
    self._max_pstop = 10
    self._max_pstop_window = 600
    self._clear_pstop_after = 2
    self._x_target_ = np.frombuffer(Array('f', 3).get_obj(), dtype='float32')
    self._x_ = np.frombuffer(Array('f', 3).get_obj(), dtype='float32')
    self._reward_ = Value('d', 0.0)

    if self._target_type == 'position':
        if self._dof == 2:
            self._target_ = np.zeros((2))
            self._target_low = self._end_effector_low[self._end_effector_indices]
            self._target_high = self._end_effector_high[self._end_effector_indices]
        elif self._dof == 6:
            self._target_ = np.zeros((3))
            self._target_low = self._end_effector_low
            self._target_high = self._end_effector_high
    elif self._target_type == 'angle':
        self._target_ = np.zeros((self._dof))
        self._target_low = self._angles_low
        self._target_high = self._angles_high

    # Set up action and observation space
    if self._derivative_type == 'second' or self._derivative_type == 'first':
        self._action_low = -np.ones(self._dof) * self._deriv_action_max
        self._action_high = np.ones(self._dof) * self._deriv_action_max
    else:  # derivative_type == 'none'
        if self._control_type == 'position':
            self._action_low = self._angles_low
            self._action_high = self._angles_high
        elif self._control_type == 'velocity':
            self._action_low = self._speed_low
            self._action_high = self._speed_high
        elif self._control_type == 'acceleration':
            self._action_low = self._accel_low
            self._action_high = self._accel_high

    # TODO: is there a nicer way to do this?
    if rllab_box:
        from rllab.spaces import Box as RlBox  # use this for rllab TRPO
        Box = RlBox
    else:
        from gym.spaces import Box as GymBox  # use this for baselines algos
        Box = GymBox

    self._observation_space = Box(
        low=np.array(
            list(self._angles_low * self._obs_history)           # q_actual
            + list(-np.ones(self._dof * self._obs_history))      # qd_actual
            + list(self._target_low)                             # target
            + list(-self._action_low)                            # previous action in cont space
        ),
        high=np.array(
            list(self._angles_high * self._obs_history)          # q_actual
            + list(np.ones(self._dof * self._obs_history))       # qd_actual
            + list(self._target_high)                            # target
            + list(self._action_high)                            # previous action in cont space
        ))
    self._action_space = Box(low=self._action_low, high=self._action_high)

    if rllab_box:
        from rllab.envs.env_spec import EnvSpec
        self._spec = EnvSpec(self.observation_space, self.action_space)

    # Only used with second derivative control
    self._first_deriv_ = np.zeros(len(self.action_space.low))

    # Communicator Parameters
    communicator_setups = {
        'UR5': {
            'num_sensor_packets': obs_history,
            'kwargs': {
                'host': self._host,
                'actuation_sync_period': actuation_sync_period,
                'buffer_len': obs_history + SharedBuffer.DEFAULT_BUFFER_LEN,
            }
        }
    }
    if self._delay > 0:
        from senseact.devices.ur.ur_communicator_delay import URCommunicator
        communicator_setups['UR5']['kwargs']['delay'] = self._delay
    else:
        from senseact.devices.ur.ur_communicator import URCommunicator
    communicator_setups['UR5']['Communicator'] = URCommunicator

    super(ReacherEnv, self).__init__(communicator_setups=communicator_setups,
                                     action_dim=len(self.action_space.low),
                                     observation_dim=len(self.observation_space.low),
                                     dt=dt,
                                     **kwargs)

    # The last action
    self._action_ = self._rand_obj_.uniform(self._action_low, self._action_high)

    # Defining packet structure for quickly building actions
    self._reset_packet = np.ones(
        self._actuator_comms['UR5'].actuator_buffer.array_len) * ur_utils.USE_DEFAULT
    self._reset_packet[0] = ur_utils.COMMANDS['MOVEJ']['id']
    self._reset_packet[1:1 + 6] = self._q_ref
    self._reset_packet[-2] = movej_t

    self._servoj_packet = np.ones(
        self._actuator_comms['UR5'].actuator_buffer.array_len) * ur_utils.USE_DEFAULT
    self._servoj_packet[0] = ur_utils.COMMANDS['SERVOJ']['id']
    self._servoj_packet[1:1 + 6] = self._q_ref
    self._servoj_packet[-3] = servoj_t
    self._servoj_packet[-1] = servoj_gain

    self._speedj_packet = np.ones(
        self._actuator_comms['UR5'].actuator_buffer.array_len) * ur_utils.USE_DEFAULT
    self._speedj_packet[0] = ur_utils.COMMANDS['SPEEDJ']['id']
    self._speedj_packet[1:1 + 6] = np.zeros((6,))
    self._speedj_packet[-2] = speedj_a
    self._speedj_packet[-1] = speedj_t_min

    # Tell the arm to do nothing (overwriting previous command)
    self._nothing_packet = np.zeros(self._actuator_comms['UR5'].actuator_buffer.array_len)

    self._pstop_unlock_packet = np.zeros(self._actuator_comms['UR5'].actuator_buffer.array_len)
    self._pstop_unlock_packet[0] = ur_utils.COMMANDS['UNLOCK_PSTOP']['id']
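# Usage sketch for the UR5 reacher environment above. The class is assumed to
# be importable as ReacherEnv, and 'UR5_default' and the host address are
# illustrative placeholders for a real entry in `setups` and a real robot IP:
#
# env = ReacherEnv(setup='UR5_default', host='192.168.1.100', dof=2,
#                  control_type='velocity', episode_length_time=4.0,
#                  rllab_box=True)
#
# With rllab_box=True, env._spec holds the rllab EnvSpec built in __init__,
# so the environment can be used with rllab policies and baselines.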
import samplers.lowlevel.rarl_parallel_sampler as parallel_sampler

parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

# env = normalize(MultilaneEnv(), 1, True, True, 0.001, 0.001)
# env = normalize(MultilaneEnv())
env = TfEnv(JustEgoEnv(port=9427))

obs1_dim = 4
obs2_dim = 4
action1_dim = 2
action2_dim = 2

spec1 = EnvSpec(
    observation_space=Box(low=-np.ones(4), high=np.ones(4)),
    action_space=Box(low=-np.ones(2), high=np.ones(2)),
)
spec2 = EnvSpec(
    observation_space=Box(low=-np.ones(4), high=np.ones(4)),
    action_space=Box(low=-np.ones(2), high=np.ones(2)),
)

with tf.Session() as sess:
    policy1 = GaussianMLPPolicy(
        env_spec=spec1,
        name="RARLTFPolicy1",
        learn_std=True,
        init_std=0.1,
        output_nonlinearity=None,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(256, 128, 64, 32),
    )
def __init__(self, inputSize, outputSize, env, v, learning_rate, batchsize,
             which_agent, x_index, y_index, num_fc_layers, depth_fc_layers,
             print_minimal):
    # init vars
    # self.sess = sess
    self.batchsize = batchsize
    self.which_agent = which_agent
    self.x_index = x_index
    self.y_index = y_index
    self.inputSize = inputSize
    self.outputSize = outputSize
    self.print_minimal = print_minimal

    LOW = -1000000
    HIGH = 1000000
    self.act_dim = env.spec.action_space.flat_dim
    self.obs_dim = env.spec.observation_space.flat_dim

    obs_to_act_spec = env.spec
    obsact_to_obs_spec = EnvSpec(
        observation_space=Box(LOW, HIGH, shape=(self.obs_dim + self.act_dim,)),
        action_space=Box(LOW, HIGH, shape=(self.obs_dim,)))

    # TODO: Think, whether to learn std for backwards policy or not.
    self.bw_act_pol = GaussianMLPPolicy(
        env_spec=obs_to_act_spec,
        hidden_sizes=(64, 64),
        learn_std=v['bw_variance_learn'],
    )
    self.bw_obs_pol = GaussianMLPPolicy(
        env_spec=obsact_to_obs_spec,
        hidden_sizes=(v['bw_model_hidden_size'], v['bw_model_hidden_size']),
        learn_std=v['bw_variance_learn'],
        hidden_nonlinearity=NL.rectify,
    )

    self.obs_in = TT.matrix('obs_in')
    self.obsact_in = TT.matrix('obsact_in')
    self.act_out = TT.matrix('act_out')
    self.diff_out = TT.matrix('diff_out')

    bw_learning_rate = v['bw_learning_rate']
    self.bw_act_dist = self.bw_act_pol.dist_info_sym(self.obs_in)
    self.bw_obs_dist = self.bw_obs_pol.dist_info_sym(self.obsact_in)
    self.bw_act_loss = -TT.sum(
        self.bw_act_pol.distribution.log_likelihood_sym(self.act_out, self.bw_act_dist))
    bw_obs_loss = -TT.sum(
        self.bw_obs_pol.distribution.log_likelihood_sym(self.diff_out, self.bw_obs_dist))

    bw_act_params = self.bw_act_pol.get_params_internal()
    bw_obs_params = self.bw_obs_pol.get_params_internal()
    # bw_params = bw_act_params + bw_obs_params
    bw_s_to_a_update = lasagne.updates.adam(self.bw_act_loss, bw_act_params,
                                            learning_rate=bw_learning_rate)
    bw_sa_to_s_update = lasagne.updates.adam(bw_obs_loss, bw_obs_params,
                                             learning_rate=bw_learning_rate)

    self.bw_act_train = theano.function([self.obs_in, self.act_out],
                                        self.bw_act_loss,
                                        updates=bw_s_to_a_update,
                                        allow_input_downcast=True)
    self.bw_obs_train = theano.function([self.obsact_in, self.diff_out],
                                        bw_obs_loss,
                                        updates=bw_sa_to_s_update,
                                        allow_input_downcast=True)
def __init__(self,
             env_spec,
             env,  # the inner one, I believe
             pkl_path=None,  # for the entire hierarchical policy
             snn_pkl_path=None,
             snn_json_path=None,
             manager_pkl_path=None,  # default is to initialize a new manager from scratch
             period=2,  # how often the manager chooses latent skill
             latent_dim=6,
             bilinear_integration=True,
             trainable_snn=True,
             trainable_manager=True,
             hidden_sizes_snn=(64, 64),
             hidden_sizes_selector=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.period = period
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None  # something
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager

    if pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
    else:
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=env.spec,
            env=env,
            pkl_path=snn_pkl_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            hidden_sizes_selector=hidden_sizes_selector)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
            outer_env_spec = EnvSpec(observation_space=self.env.observation_space,
                                     action_space=self.outer_action_space)
            self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim)

    Serializable.quick_init(self, locals())  # todo: is this where this belongs?
def __init__(self,
             env_spec,
             env,  # the inner one, I believe
             pkl_path=None,  # for the entire hierarchical policy, can take in npz too!
             snn_pkl_path=None,  # can actually be either pkl or npz
             snn_json_path=None,
             manager_pkl_path=None,  # default is to initialize a new manager from scratch
             period=10,  # how often the manager chooses latent skill
             latent_dim=6,
             bilinear_integration=True,
             trainable_snn=True,
             trainable_manager=True,
             continuous_latent=False,  # assumed flag: the body below branches on it
             hidden_sizes_snn=(64, 64),
             hidden_sizes_manager=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.period = period
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None
    self.curr_manager_obs = None
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager
    self.trainable_snn = trainable_snn
    self.continuous_latent = continuous_latent

    if pkl_path and '.npz' not in pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # todo: the above is wrong, need to figure out how to warm start the params
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
    else:
        if snn_pkl_path is not None and '.npz' in snn_pkl_path:
            npz_path = snn_pkl_path
            snn_pkl_path = None
        else:
            npz_path = None
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=env.spec,
            env=env,
            pkl_path=snn_pkl_path,
            npz_path=npz_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            # assumed mapping: this signature exposes the selector sizes as hidden_sizes_manager
            hidden_sizes_selector=hidden_sizes_manager)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
            if self.continuous_latent:
                outer_env_spec = EnvSpec(
                    observation_space=self.env.observation_space,
                    action_space=spaces.Box(-1.0, 1.0, shape=(latent_dim,)))
                self.manager = GaussianMLPPolicy(env_spec=outer_env_spec)
            else:
                outer_env_spec = EnvSpec(
                    observation_space=self.env.observation_space,
                    action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim)

    # import ipdb; ipdb.set_trace()
    if pkl_path is not None and '.npz' in pkl_path:
        param_dict = dict(np.load(os.path.join(config.PROJECT_PATH, pkl_path)))
        param_values = param_dict['params']
        self.set_param_values(param_values)

    Serializable.quick_init(self, locals())  # todo: is this where this belongs?
def run_experiment(variant):
    # print('MuJoCo')
    # env = normalize(GymEnv('HalfCheetah-v1'))

    # -----------------------------------------------------
    print('Unity3D environment')
    env = UnityEnv('/home/recharrs/Apps/UnityEnvob3/RollerBall.x86_64',
                   time_state=True, idx=args.idx, no_graphics=args.no_graphics)
    # -----------------------------------------------------

    obs_space = env.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low.flatten(), np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high.flatten(), np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=5000,
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=0,  # must be set to 0, otherwise evaluation raises an error
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
    )

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),
    )

    algorithm.train()
def spec(self):
    return EnvSpec(
        observation_space=self.env_agents[0].observation_space,
        action_space=self.env_agents[0].action_space,
    )
def spec(self):
    return EnvSpec(
        observation_space=self.observation_space,
        action_space=self.action_space,
    )
def __init__(self,
             setup='dxl_tracker_default',
             idn=9,
             baudrate=1000000,
             obs_history=2,
             dt=0.01,
             gripper_dt=0.006,
             rllab_box=False,
             episode_length_step=None,
             episode_length_time=4,
             dof=1,
             max_torque_mag=300,
             control_type='torque',
             target_type='position',
             reset_type='zero',
             reward_type='linear',
             delay=0,
             dxl_dev_path='None',
             max_velocity=5,
             history_dt=None,
             use_ctypes_driver=True,
             **kwargs):
    """Inits DxlTracker1DEnv class with task and servo specific parameters.

    Args:
        setup: A dictionary containing DXL tracker task specifications,
            such as bounding box dimensions, joint angle ranges and max load.
        idn: An integer representing the DXL ID number
        baudrate: An integer representing a baudrate to connect at
        obs_history: An integer number of sensory packets concatenated into
            a single observation vector
        dt: A float specifying duration of an environment time step in seconds.
        gripper_dt: A float representing DXLCommunicator cycle time
        rllab_box: A bool specifying whether to wrap environment action and
            observation spaces into an RllabBox object (required for
            off-the-shelf rllab algorithms implementations).
        episode_length_time: A float duration of an episode defined in seconds
        episode_length_step: An integer duration of an episode defined in
            environment steps.
        dof: An integer number of degrees of freedom
        max_torque_mag: An integer representing max possible torque command
            to be sent to the DXL device
        control_type: A string specifying the control space; only "torque"
            is implemented here.
        target_type: A string specifying in what space to provide target
            coordinates, either "position" for Cartesian space or "angle"
            for joints angles space.
        reset_type: A string specifying whether to reset the arm to a fixed
            position or to a random position.
        reward_type: A string specifying the reward function
            (e.g., "linear" for - d_t)
        delay: A float specifying artificial observation delay in seconds
        dxl_dev_path: A string containing the serial port address
            (e.g., /dev/ttyACM0 or /dev/ttyUSB0 on linux)
        max_velocity: A float representing the max possible velocity command
            to be sent to the DXL device
        history_dt: A float; when set, the last action included in the
            observation is the action chosen at t - history_dt
        use_ctypes_driver: A bool. Setting it to True chooses the CType-based
            driver. We found the CType-based driver to provide substantially
            more timely and precise communication compared to the
            pyserial-based one.
        **kwargs: Keyword arguments
    """
    self.max_temperature = 60
    self.cool_down_temperature = 50
    self.obs_history = obs_history
    self.dt = dt
    self.gripper_dt = gripper_dt
    self.max_torque_mag = np.array([max_torque_mag])
    self.max_velocity = np.array([max_velocity])

    if rllab_box:
        from rllab.spaces import Box as RlBox  # use this for rllab TRPO
        Box = RlBox
    else:
        from gym.spaces import Box as GymBox  # use this for baselines algos
        Box = GymBox

    if control_type not in ['torque']:
        raise NotImplementedError('{} control not implemented'.format(control_type))
    self.control_type = control_type

    if target_type not in ['position']:
        raise NotImplementedError('{} target not implemented'.format(target_type))
    self.target_type = target_type

    if reset_type not in ['zero', 'random', 'none']:
        raise NotImplementedError('{} reset not implemented'.format(reset_type))
    self.reset_type = reset_type

    if reward_type not in ['linear']:
        raise NotImplementedError('{} reward not implemented'.format(reward_type))
    self.reward_type = reward_type

    if control_type == 'torque':
        self.action_low = -self.max_torque_mag
        self.action_high = +self.max_torque_mag
    elif control_type == 'velocity':
        self.action_low = -self.max_velocity
        self.action_high = +self.max_velocity

    if setup not in setups:
        raise NotImplementedError('Config not found')
    self.angle_low = setups[setup]['angles_low'][0]
    self.angle_high = setups[setup]['angles_high'][0]

    # Load value for detecting a closed gripper during reset
    self.high_load = setups[setup]['high_load'][0]

    self._present_pos_ = np.zeros((obs_history, 1))

    self._observation_space = Box(
        low=np.array(
            # list(0*np.ones(self.obs_history))                  # torque enable
            # + list(0*np.ones(self.obs_history))                # alarm led
            # + list(0*np.ones(self.obs_history))                # led
            list(-pi * np.ones(self.obs_history))                # present position
            + list(self.angle_low * np.ones(2))                  # target position
            + list(-np.inf * np.ones(self.obs_history))          # present speed
            # + list(-np.inf*np.ones(self.obs_history))          # present load
            # + list(0 * np.ones(self.obs_history))              # temperature
            # + list(0*np.ones(self.obs_history))                # registered
            # + list(0*np.ones(self.obs_history))                # moving
            # + list(-np.inf * np.ones(self.obs_history))        # current
            # + list(-np.inf*np.ones(self.obs_history))          # voltage
            + list(self.action_low * np.ones(self.obs_history))  # last action
        ),
        high=np.array(
            # list(1 * np.ones(self.obs_history))                 # torque enable
            # + list(128 * np.ones(self.obs_history))             # alarm led
            # + list(1 * np.ones(self.obs_history))               # led
            list(pi * np.ones(self.obs_history))                  # present position
            + list(self.angle_high * np.ones(2))                  # target position
            + list(+np.inf * np.ones(self.obs_history))           # present speed
            # + list(+np.inf * np.ones(self.obs_history))         # present load
            # + list(255 * np.ones(self.obs_history))             # temperature
            # + list(1 * np.ones(self.obs_history))               # registered
            # + list(1 * np.ones(self.obs_history))               # moving
            # + list(+np.inf * np.ones(self.obs_history))         # current
            # + list(+np.inf * np.ones(self.obs_history))         # voltage
            + list(self.action_high * np.ones(self.obs_history))  # last action
        )
    )
    self._action_space = Box(low=self.action_low, high=self.action_high)

    if rllab_box:
        # Build the rllab spec only after both spaces have been defined
        from rllab.envs.env_spec import EnvSpec
        self._spec = EnvSpec(self.observation_space, self.action_space)

    self._comm_name = 'DxlTracker1D'
    self._dxl_dev_path = dxl_dev_path
    communicator_setups = {
        self._comm_name: {
            'Communicator': gcomm.DXLCommunicator,
            'num_sensor_packets': obs_history,
            'kwargs': {
                'idn': idn,
                'baudrate': baudrate,
                'sensor_dt': gripper_dt,
                'device_path': self._dxl_dev_path,
                'use_ctypes_driver': use_ctypes_driver,
            }
        }
    }

    super(DxlTracker1DEnv, self).__init__(
        communicator_setups=communicator_setups,
        action_dim=1,
        observation_dim=self.observation_space.shape[0],
        dt=dt,
        **kwargs
    )

    read_block = dxl_mx64.MX64.subblock('version_0', 'goal_acceleration',
                                        ret_dxl_type=use_ctypes_driver)
    self.regnames = [reg.name for reg in read_block]
    self.reg_index = dict(zip(self.regnames, range(len(self.regnames))))

    self.episode_steps = Value('i', 0)
    if episode_length_step is not None:
        assert episode_length_time is None
        self.episode_length_step = episode_length_step
        self.episode_length_time = episode_length_step * dt
    elif episode_length_time is not None:
        assert episode_length_step is None
        self.episode_length_time = episode_length_time
        self.episode_length_step = int(episode_length_time / dt)
    else:
        # TODO: should we allow a continuous behaviour case here, with no episodes?
        print("episode_length_time or episode_length_step needs to be set")
        raise AssertionError

    # Task Parameters
    self.obs_history = obs_history
    self.dof = dof
    self.delay = delay
    self.pos_range = self.angle_high - self.angle_low

    if history_dt is not None:
        self.history_dt = history_dt
        self.dt_ratio = int(history_dt / self.gripper_dt)
    else:
        self.dt_ratio = int(self.dt / self.gripper_dt)
    self.comm_episode_length_step = self.episode_length_step * self.dt_ratio

    # Default initialization
    target_pos = np.random.uniform(low=self.angle_low, high=self.angle_high)
    self.pos_range = self.angle_high - self.angle_low
    self.reset_pos_center = self.angle_high - (self.pos_range / 2.)
    self.action_range = self.action_high - self.action_low

    self._reward_ = Value('d', 0.0)
    self._reset_pos_ = Value('d', self.reset_pos_center)
    self._present_pos_ = np.frombuffer(Array('f', self.obs_history).get_obj(), dtype='float32')
    self._target_pos_ = Value('d', target_pos)
    self._temperature_ = [0] * self.obs_history
    self._action_history = deque([0] * (self.obs_history + 1), self.obs_history + 1)

    # Generate trajectory
    self.generate_trajectory()

    # Tell the dxl to do nothing (overwriting previous command)
    self.nothing_packet = np.zeros(self._actuator_comms[self._comm_name].actuator_buffer.array_len)

    # PID control gains for reset
    self.kp = 161.1444  # Proportional gain
    self.ki = 0  # Integral gain
    self.kd = 0  # Derivative gain
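# Usage sketch for the tracker environment above. The import location of
# DxlTracker1DEnv and the device path are assumptions for illustration; a DXL
# servo must actually be reachable on the given serial port:
#
# env = DxlTracker1DEnv(setup='dxl_tracker_default', idn=9,
#                       dxl_dev_path='/dev/ttyUSB0', rllab_box=True)
#
# With rllab_box=True, env._spec holds the rllab EnvSpec built in __init__
# (the public accessor name for it is not shown in the snippet).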
def __init__(self,
             env_spec,
             env,  # the inner one, I believe
             pkl_path=None,  # for the entire hierarchical policy
             snn_pkl_path=None,
             snn_json_path=None,
             manager_pkl_path=None,  # default is to initialize a new manager from scratch
             max_period=10,  # possible periods
             latent_dim=6,
             bilinear_integration=True,
             trainable_snn=True,
             trainable_manager=True,
             hidden_sizes_snn=(64, 64),
             hidden_sizes_selector=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.periods = np.arange(1, max_period + 1)
    assert len(self.periods) > 0
    self.curr_period = self.periods[0]
    self.max_period = max(self.periods)
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None  # something
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager
    self.random_period = True
    self.fake_env = PeriodVaryingEnv(env)

    if pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
    else:
        # env spec that includes the extra parameter for time
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=self.fake_env.spec,
            env=self.fake_env,
            pkl_path=snn_pkl_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            hidden_sizes_selector=hidden_sizes_selector)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
            outer_env_spec = EnvSpec(observation_space=self.fake_env.observation_space,
                                     action_space=self.outer_action_space)
            self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim)

    if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0

    Serializable.quick_init(self, locals())  # todo: ask if this fixes my problem