def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=10,
                 use_skill_dependent_baseline=False,
                 mlp_skill_dependent_baseline=False,
                 freeze_manager=False,
                 freeze_skills=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters, **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
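
Note: a minimal numeric sketch of the dimension arithmetic behind skill_dependent_obs_space_dim above; the base observation size here is an assumed, illustrative value.

# Illustrative check of the skill-dependent observation dimension.
base_obs_dim = 27      # assumed base observation size (not taken from the code above)
num_latents = 6

obs_dim_with_t = base_obs_dim + 1  # +1 for t_remaining
skill_dependent_obs_dim = obs_dim_with_t * (num_latents + 1) + num_latents
print(skill_dependent_obs_dim)  # 202
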
Example #2
    def __init__(self, wrapped_env):
        self.__wrapped_env = wrapped_env
        observation_space = wrapped_env.observation_space
        action_space = wrapped_env.action_space
        self.observation_space = convert_gym_space(observation_space)
        self.action_space = convert_gym_space(action_space)
        self.spec = EnvSpec(self.observation_space, self.action_space)
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=1e-2,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            use_skill_dependent_baseline=False,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(batch_size=None, max_epochs=1)
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                            **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent_approx,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period

        self.trainable_manager = self.policy.trainable_manager

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = (
                (curr_env.observation_space.shape[0] + 1) * self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_dependent_env_spec)
Example #4
    def spec(self):
        """
        Returns an EnvSpec.

        Returns:
            spec (garage.envs.EnvSpec)
        """
        return EnvSpec(observation_space=self.observation_space,
                       action_space=self.action_space)
Example #5
    def __init__(self, env, base_policy, num_skills, steps_per_option=100):
        Serializable.quick_init(self, locals())
        self._base_policy = base_policy
        self._env = env
        self._steps_per_option = steps_per_option
        self._num_skills = num_skills
        self.observation_space = self._env.observation_space
        self.action_space = spaces.Discrete(num_skills)
        self.spec = EnvSpec(self.observation_space, self.action_space)
        self._obs = self.reset()
Example #6
    def __init__(self, env, num_skills, z):
        Serializable.quick_init(self, locals())
        self._env = env
        self._num_skills = num_skills
        self._z = z
        obs_space = self._env.observation_space
        low = np.hstack([obs_space.low, np.full(num_skills, 0)])
        high = np.hstack([obs_space.high, np.full(num_skills, 1)])
        self.observation_space = spaces.Box(low=low, high=high)
        self.action_space = self._env.action_space
        self.spec = EnvSpec(self.observation_space, self.action_space)
Example #7
    def __init__(self, env):
        Serializable.quick_init(self, locals())
        ProxyEnv.__init__(self, env)
        self.action_space = convert_space_to_tf_space(
            self._wrapped_env.action_space
        )
        self.observation_space = convert_space_to_tf_space(
            self._wrapped_env.observation_space
        )
        from rllab.envs.env_spec import EnvSpec
        self.spec = EnvSpec(
            observation_space=self.observation_space,
            action_space=self.action_space,
        )
Example #8
    def env_create(self):
        route = '/v1/envs/'
        data = {'env_id': 'lol'}
        resp = self._post_request(route, data)
        instance_id = resp['instance_id']
        self.instance_id = instance_id
        asi = self.env_action_space_info(instance_id)
        obi = self.env_observation_space_info(instance_id)
        self.action_space = Box(np.array(asi['low']), np.array(asi['high']))
        self.observation_space = Box(np.array(obi['low']),
                                     np.array(obi['high']))
        self.spec = EnvSpec(observation_space=self.observation_space,
                            action_space=self.action_space)
        self.spec.id = 0
        self.spec.timestep_limit = 500
        self.instance_id = instance_id
Example #9
    def goal_env_spec(self, dict_keys=('observation', 'desired_goal')):
        """
        convert selected keys of a dict observation space to a Box space
        and return the corresponding env_spec.
        :param dict_keys: (`tuple`) desired keys that you would like to use.
        :return: (`EnvSpec`) converted object
        """
        assert isinstance(self.observation_space, Dict)
        obs_dim = np.sum([self.observation_space.spaces[key].flat_dim for key in dict_keys])
        obs_space = Box(-np.inf, np.inf, shape=(obs_dim,))

        return EnvSpec(
            observation_space=obs_space,
            action_space=self.action_space,
        )
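
Note: a self-contained sketch of the same key-flattening idea using gym.spaces only; the EnvSpec below is a namedtuple stand-in for the rllab/garage class, and all space shapes are illustrative assumptions.

from collections import namedtuple

import numpy as np
from gym import spaces

EnvSpec = namedtuple('EnvSpec', ['observation_space', 'action_space'])

dict_obs_space = spaces.Dict({
    'observation': spaces.Box(-np.inf, np.inf, shape=(6,), dtype=np.float32),
    'desired_goal': spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float32),
    'achieved_goal': spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float32),
})
action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)

# Flatten only the selected keys into a single Box observation space.
dict_keys = ('observation', 'desired_goal')
obs_dim = int(sum(np.prod(dict_obs_space.spaces[key].shape) for key in dict_keys))
obs_space = spaces.Box(-np.inf, np.inf, shape=(obs_dim,), dtype=np.float32)

spec = EnvSpec(observation_space=obs_space, action_space=action_space)
print(spec.observation_space.shape)  # (9,)
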
Example #10
    def __init__(self, env):
        super().__init__(env)
        Serializable.quick_init(self, locals())
        self.keys = ["robot-state", "object-state"]

        # set up observation and action spaces
        flat_ob = self._flatten_obs(self.env.reset(), verbose=True)
        self.obs_dim = flat_ob.size
        high = np.inf * np.ones(self.obs_dim)
        low = -high
        self.observation_space = Box(low=low, high=high)
        low, high = self.env.action_spec
        self.action_space = Box(low=low, high=high)

        self.spec = EnvSpec(
            observation_space=self.observation_space,
            action_space=self.action_space,
        )
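
Note: a hypothetical illustration of the kind of flattening _flatten_obs performs above; the keys mirror the wrapper, but the array sizes are assumed and this is not the wrapper's actual implementation.

import numpy as np

obs = {
    "robot-state": np.zeros(32),   # assumed size
    "object-state": np.zeros(10),  # assumed size
}
keys = ["robot-state", "object-state"]
# Concatenate the selected entries into one flat observation vector.
flat_ob = np.concatenate([np.asarray(obs[k]).ravel() for k in keys])
print(flat_ob.size)  # 42
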
Example #11
    def __init__(
        self,
        regressor,
        inputSize,
        outputSize,
        left_min,
        right_min,
        left_max,
        right_max,
        state_representation='all',
        n_candidates=1000,
        horizon=10,
        horiz_penalty_factor=30,
        backward_discouragement=10,
        heading_penalty_factor=5,
        visualize_rviz=True,
        x_index=0,
        y_index=1,
        yaw_cos_index=10,
        yaw_sin_index=11,
        test_regressor=False,
        frequency_value=10,
        serial_port=None,
        baud_rate=None,
    ):
        self.regressor = regressor
        self.visualize_rviz = visualize_rviz
        self.state_representation = state_representation

        #from config['policy']
        self.n_candidates = n_candidates
        self.horizon = horizon
        self.horiz_penalty_factor = horiz_penalty_factor
        self.backward_discouragement = backward_discouragement
        self.heading_penalty_factor = heading_penalty_factor

        #from config['roach']
        self.x_index = x_index
        self.y_index = y_index
        self.yaw_cos_index = yaw_cos_index
        self.yaw_sin_index = yaw_sin_index

        #useless
        self.random = False
        self.multi_input = False
        self.is_onehot = False

        #initialized by the gbac_controller_node
        self.desired_states = None
        self.publish_markers_desired = None
        self.publish_markers = None
        '''if test_regressor:
            self.plot = PlotRegressor('RoachEnv')
            #NOTE: setting this to True doesn't give you an optimal rollout
                #it plans only every horizon steps, and doesn't replan at each step
                    #because it's trying to verify accuracy of regressor'''
        Serializable.quick_init(self, locals())

        #define action space for roach
        my_obs = Box(low=-1 * np.ones(outputSize, ),
                     high=np.ones(outputSize, ))
        my_ac = Box(low=np.array([left_min, right_min]),
                    high=np.array([left_max, right_max]))
        super(NaiveMPCController, self).__init__(
            env_spec=EnvSpec(observation_space=my_obs, action_space=my_ac))
Example #12
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(min_pool_size=variant['max_path_length'],
                       epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       max_path_length=variant['max_path_length'],
                       batch_size=variant['batch_size'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=SimpleSampler(
                           max_path_length=variant["max_path_length"],
                           min_pool_size=variant["max_path_length"],
                           batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )

    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(base_kwargs=base_kwargs,
                      env=env,
                      policy=policy,
                      discriminator=discriminator,
                      pool=pool,
                      qf=qf,
                      vf=vf,
                      lr=variant['lr'],
                      scale_entropy=variant['scale_entropy'],
                      discount=variant['discount'],
                      tau=variant['tau'],
                      num_skills=variant['num_skills'],
                      save_full_state=False,
                      include_actions=variant['include_actions'],
                      learn_p_z=variant['learn_p_z'],
                      add_p_z=variant['add_p_z'],
                      reparametrize=variant["reparametrize"])

    algorithm.train()
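
Note: a minimal sketch of how an observation is paired with the augmented spec built above, by appending a one-hot skill indicator; the base observation size and skill count are illustrative assumptions.

import numpy as np

num_skills = 6
obs = np.zeros(17)                   # assumed base observation
z = 2                                # active skill index
one_hot = np.zeros(num_skills)
one_hot[z] = 1.0
aug_obs = np.hstack([obs, one_hot])  # matches the layout of aug_obs_space
print(aug_obs.shape)  # (23,)
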
Example #13
    def __init__(
            self,
            setup,
            host=None,
            dof=6,
            control_type='position',
            derivative_type='none',
            target_type='position',
            reset_type='random',
            reward_type='linear',
            deriv_action_max=10,
            first_deriv_max=10,  # used only with second derivative control
            vel_penalty=0,
            obs_history=1,
            actuation_sync_period=1,
            episode_length_time=None,
            episode_length_step=None,
            rllab_box=False,
            servoj_t=ur_utils.COMMANDS['SERVOJ']['default']['t'],
            servoj_gain=ur_utils.COMMANDS['SERVOJ']['default']['gain'],
            speedj_a=ur_utils.COMMANDS['SPEEDJ']['default']['a'],
            speedj_t_min=ur_utils.COMMANDS['SPEEDJ']['default']['t_min'],
            movej_t=2,  # used for resetting
            accel_max=None,
            speed_max=None,
            dt=0.008,
            delay=0.0,  # to simulate extra delay in the system
            **kwargs):
        """Inits ReacherEnv class with task and robot specific parameters.

        Args:
            setup: a dictionary containing UR5 reacher task specifications,
                such as safety box dimensions, joint angle ranges, boundary
                on the arm speed, UR5 Controller IP address etc
                (see senseact.devices.ur.ur_setups for examples).
            host: a string specifying UR5 IP address or None
            dof: an integer number of degrees of freedom, either 2 for
                Reacher2D or 6 for Reacher6D
            control_type: a string specifying UR5 control type, either
                position (using UR5 servoJ commands) or velocity
                (using UR5 speedJ commands)
            derivative_type: a string specifying what type of derivative
                control to use, either "none", "first" or "second"
            target_type: a string specifying in what space to provide
                target coordinates, either "position" for Cartesian space
                or "angle" for joints angles space.
            reset_type: a string specifying whether to reset the arm to a
                fixed position or to a random position.
            reward_type: a string specifying the reward function, either
                "linear" for -d_t, or "precision" for -d_t + exp(-d_t^2)
            deriv_action_max: a float specifying maximum value of an action
                for derivative control
            first_deriv_max: a float specifying maximum value of a first
                derivative of action if derivative_type =="second"
            vel_penalty: a float specifying the weight of a velocity
                penalty term in the reward function.
            obs_history: an integer number of sensory packets concatenated
                into a single observation vector
            actuation_sync_period: a bool specifying whether to synchronize
                sending actuation commands to UR5 with receiving sensory
                packets from UR5 (must be true for smooth UR5 operation).
            episode_length_time: a float duration of an episode defined
                in seconds
            episode_length_step: an integer duration of an episode
                defined in environment steps.
            rllab_box: a bool specifying whether to wrap environment
                action and observation spaces into an RllabBox object
                (required for off-the-shelf rllab algorithms implementations).
            servoj_t: a float specifying time parameter of a UR5
                servoj command.
            servoj_gain: a float specifying gain parameter of a UR5
                servoj command.
            speedj_a: a float specifying acceleration parameter of a UR5
                speedj command.
            speedj_t_min: a float specifying t_min parameter of a UR5
                speedj command.
            movej_t: a float specifying time parameter of a UR5
                movej command (used for resetting).
            accel_max: a float specifying maximum allowed acceleration
                of UR5 arm joints. If None, a value from setup is used.
            speed_max: a float specifying maximum allowed speed of UR5 joints.
                If None, a value from setup is used.
            dt: a float specifying duration of an environment time step
                in seconds.
            delay: a float specifying artificial observation delay in seconds

        """

        # Check that the task parameters chosen are implemented in this class
        assert dof in [2, 6]
        assert control_type in ['position', 'velocity', 'acceleration']
        assert derivative_type in ['none', 'first', 'second']
        assert target_type in ['position', 'angle']
        assert reset_type in ['random', 'zero', 'none']
        assert actuation_sync_period >= 0

        if episode_length_step is not None:
            assert episode_length_time is None
            self._episode_length_step = episode_length_step
            self._episode_length_time = episode_length_step * dt
        elif episode_length_time is not None:
            assert episode_length_step is None
            self._episode_length_time = episode_length_time
            self._episode_length_step = int(episode_length_time / dt)
        else:
            #TODO: should we allow a continuous behaviour case here, with no episodes?
            print("episode_length_time or episode_length_step needs to be set")
            raise AssertionError

        # Task Parameters
        self._host = setups[setup]['host'] if host is None else host
        self._obs_history = obs_history
        self._dof = dof
        self._control_type = control_type
        self._derivative_type = derivative_type
        self._target_type = target_type
        self._reset_type = reset_type
        self._reward_type = reward_type
        self._vel_penalty = vel_penalty  # weight of the velocity penalty
        self._deriv_action_max = deriv_action_max
        self._first_deriv_max = first_deriv_max
        self._delay = delay
        if accel_max is None:
            accel_max = setups[setup]['accel_max']
        if speed_max is None:
            speed_max = setups[setup]['speed_max']
        if self._dof == 6:
            self._joint_indices = [0, 1, 2, 3, 4, 5]
            self._end_effector_indices = [0, 1, 2]
        elif self._dof == 2:
            self._joint_indices = [1, 2]
            self._end_effector_indices = [1, 2]

        # Arm/Control/Safety Parameters
        self._end_effector_low = setups[setup]['end_effector_low']
        self._end_effector_high = setups[setup]['end_effector_high']
        self._angles_low = setups[setup]['angles_low'][self._joint_indices]
        self._angles_high = setups[setup]['angles_high'][self._joint_indices]
        self._speed_low = -np.ones(self._dof) * speed_max
        self._speed_high = np.ones(self._dof) * speed_max
        self._accel_low = -np.ones(self._dof) * accel_max
        self._accel_high = np.ones(self._dof) * accel_max

        self._box_bound_buffer = setups[setup]['box_bound_buffer']
        self._angle_bound_buffer = setups[setup]['angle_bound_buffer']
        self._q_ref = setups[setup]['q_ref']
        self._ik_params = setups[setup]['ik_params']

        # State Variables
        self._q_ = np.zeros((self._obs_history, self._dof))
        self._qd_ = np.zeros((self._obs_history, self._dof))

        self._episode_steps = 0

        self._pstop_time_ = None
        self._pstop_times_ = []
        self._safety_mode_ = ur_utils.SafetyModes.NONE
        self._max_pstop = 10
        self._max_pstop_window = 600
        self._clear_pstop_after = 2
        self._x_target_ = np.frombuffer(Array('f', 3).get_obj(),
                                        dtype='float32')
        self._x_ = np.frombuffer(Array('f', 3).get_obj(), dtype='float32')
        self._reward_ = Value('d', 0.0)

        if self._target_type == 'position':
            if self._dof == 2:
                self._target_ = np.zeros((2))
                self._target_low = self._end_effector_low[
                    self._end_effector_indices]
                self._target_high = self._end_effector_high[
                    self._end_effector_indices]
            elif self._dof == 6:
                self._target_ = np.zeros((3))
                self._target_low = self._end_effector_low
                self._target_high = self._end_effector_high
        elif self._target_type == 'angle':
            self._target_ = np.zeros((self._dof))
            self._target_low = self._angles_low
            self._target_high = self._angles_high

        # Set up action and observation space

        if self._derivative_type == 'second' or self._derivative_type == 'first':
            self._action_low = -np.ones(self._dof) * self._deriv_action_max
            self._action_high = np.ones(self._dof) * self._deriv_action_max
        else:  # derivative_type=='none'
            if self._control_type == 'position':
                self._action_low = self._angles_low
                self._action_high = self._angles_high
            elif self._control_type == 'velocity':
                self._action_low = self._speed_low
                self._action_high = self._speed_high
            elif self._control_type == 'acceleration':
                self._action_low = self._accel_low
                self._action_high = self._accel_high

        # TODO: is there a nicer way to do this?
        if rllab_box:
            from rllab.spaces import Box as RlBox  # use this for rllab TRPO
            Box = RlBox
        else:
            from gym.spaces import Box as GymBox  # use this for baselines algos
            Box = GymBox

        self._observation_space = Box(
            low=np.array(
                list(self._angles_low * self._obs_history)  # q_actual
                + list(-np.ones(self._dof * self._obs_history))  # qd_actual
                + list(self._target_low)  # target
                + list(-self._action_low)  # previous action in cont space
            ),
            high=np.array(
                list(self._angles_high * self._obs_history)  # q_actual
                + list(np.ones(self._dof * self._obs_history))  # qd_actual
                + list(self._target_high)  # target
                + list(self._action_high)  # previous action in cont space
            ))

        self._action_space = Box(low=self._action_low, high=self._action_high)

        if rllab_box:
            from rllab.envs.env_spec import EnvSpec
            self._spec = EnvSpec(self.observation_space, self.action_space)

        # Only used with second derivative control
        self._first_deriv_ = np.zeros(len(self.action_space.low))

        # Communicator Parameters
        communicator_setups = {
            'UR5': {
                'num_sensor_packets': obs_history,
                'kwargs': {
                    'host': self._host,
                    'actuation_sync_period': actuation_sync_period,
                    'buffer_len':
                    obs_history + SharedBuffer.DEFAULT_BUFFER_LEN,
                }
            }
        }
        if self._delay > 0:
            from senseact.devices.ur.ur_communicator_delay import URCommunicator
            communicator_setups['UR5']['kwargs']['delay'] = self._delay
        else:
            from senseact.devices.ur.ur_communicator import URCommunicator
        communicator_setups['UR5']['Communicator'] = URCommunicator

        super(ReacherEnv,
              self).__init__(communicator_setups=communicator_setups,
                             action_dim=len(self.action_space.low),
                             observation_dim=len(self.observation_space.low),
                             dt=dt,
                             **kwargs)

        # The last action
        self._action_ = self._rand_obj_.uniform(self._action_low,
                                                self._action_high)

        # Defining packet structure for quickly building actions
        self._reset_packet = np.ones(
            self._actuator_comms['UR5'].actuator_buffer.array_len
        ) * ur_utils.USE_DEFAULT
        self._reset_packet[0] = ur_utils.COMMANDS['MOVEJ']['id']
        self._reset_packet[1:1 + 6] = self._q_ref
        self._reset_packet[-2] = movej_t

        self._servoj_packet = np.ones(
            self._actuator_comms['UR5'].actuator_buffer.array_len
        ) * ur_utils.USE_DEFAULT
        self._servoj_packet[0] = ur_utils.COMMANDS['SERVOJ']['id']
        self._servoj_packet[1:1 + 6] = self._q_ref
        self._servoj_packet[-3] = servoj_t
        self._servoj_packet[-1] = servoj_gain

        self._speedj_packet = np.ones(
            self._actuator_comms['UR5'].actuator_buffer.array_len
        ) * ur_utils.USE_DEFAULT
        self._speedj_packet[0] = ur_utils.COMMANDS['SPEEDJ']['id']
        self._speedj_packet[1:1 + 6] = np.zeros((6, ))
        self._speedj_packet[-2] = speedj_a
        self._speedj_packet[-1] = speedj_t_min

        # Tell the arm to do nothing (overwriting previous command)
        self._nothing_packet = np.zeros(
            self._actuator_comms['UR5'].actuator_buffer.array_len)

        self._pstop_unlock_packet = np.zeros(
            self._actuator_comms['UR5'].actuator_buffer.array_len)
        self._pstop_unlock_packet[0] = ur_utils.COMMANDS['UNLOCK_PSTOP']['id']
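
Note: a rough check of the observation length implied by the Box constructed above, under assumed illustrative settings (dof=6, obs_history=1, Cartesian target).

dof = 6
obs_history = 1
target_dim = 3  # Cartesian target for the 6-DoF case

obs_dim = (dof * obs_history      # q_actual
           + dof * obs_history    # qd_actual
           + target_dim           # target
           + dof)                 # previous action
print(obs_dim)  # 21
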
import samplers.lowlevel.rarl_parallel_sampler as parallel_sampler
parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

#env = normalize(MultilaneEnv(),1,True,True,0.001,0.001)
#env = normalize(MultilaneEnv())
env = TfEnv(JustEgoEnv(port=9427))

obs1_dim = 4
obs2_dim = 4
action1_dim = 2
action2_dim = 2

spec1 = EnvSpec(
                observation_space = Box(low=-np.ones(4), high=np.ones(4)),
                action_space = Box(low=-np.ones(2), high=np.ones(2)),
                )
spec2 = EnvSpec(
                observation_space = Box(low=-np.ones(4), high=np.ones(4)),
                action_space = Box(low=-np.ones(2), high=np.ones(2)),
                )

with tf.Session() as sess:
    policy1 = GaussianMLPPolicy(
        env_spec=spec1,
        name="RARLTFPolicy1",
        learn_std=True,
        init_std=0.1,
        output_nonlinearity=None,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(256, 128, 64, 32),
Example #15
    def __init__(self, inputSize, outputSize, env, v, learning_rate, batchsize,
                 which_agent, x_index, y_index, num_fc_layers, depth_fc_layers,
                 print_minimal):

        #init vars
        #self.sess = sess
        self.batchsize = batchsize
        self.which_agent = which_agent
        self.x_index = x_index
        self.y_index = y_index
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.print_minimal = print_minimal

        LOW = -1000000
        HIGH = 1000000
        self.act_dim = env.spec.action_space.flat_dim
        self.obs_dim = env.spec.observation_space.flat_dim
        obs_to_act_spec = env.spec
        obsact_to_obs_spec = EnvSpec(
            observation_space=Box(LOW, HIGH,
                                  shape=(self.obs_dim + self.act_dim, )),
            action_space=Box(LOW, HIGH, shape=(self.obs_dim, )))

        #TODO: Think, whether to learn std for backwards policy or not.
        self.bw_act_pol = GaussianMLPPolicy(
            env_spec=obs_to_act_spec,
            hidden_sizes=(64, 64),
            learn_std=v['bw_variance_learn'],
        )

        self.bw_obs_pol = GaussianMLPPolicy(
            env_spec=obsact_to_obs_spec,
            hidden_sizes=(v['bw_model_hidden_size'],
                          v['bw_model_hidden_size']),
            learn_std=v['bw_variance_learn'],
            hidden_nonlinearity=NL.rectify,
        )

        self.obs_in = TT.matrix('obs_in')
        self.obsact_in = TT.matrix('obsact_in')
        self.act_out = TT.matrix('act_out')
        self.diff_out = TT.matrix('diff_out')

        bw_learning_rate = v['bw_learning_rate']
        self.bw_act_dist = self.bw_act_pol.dist_info_sym(self.obs_in)
        self.bw_obs_dist = self.bw_obs_pol.dist_info_sym(self.obsact_in)
        self.bw_act_loss = -TT.sum(
            self.bw_act_pol.distribution.log_likelihood_sym(
                self.act_out, self.bw_act_dist))
        bw_obs_loss = -TT.sum(
            self.bw_obs_pol.distribution.log_likelihood_sym(
                self.diff_out, self.bw_obs_dist))

        bw_act_params = self.bw_act_pol.get_params_internal()
        bw_obs_params = self.bw_obs_pol.get_params_internal()
        #bw_params = bw_act_params + bw_obs_params
        bw_s_to_a_update = lasagne.updates.adam(self.bw_act_loss,
                                                bw_act_params,
                                                learning_rate=bw_learning_rate)
        bw_sa_to_s_update = lasagne.updates.adam(
            bw_obs_loss, bw_obs_params, learning_rate=bw_learning_rate)

        self.bw_act_train = theano.function([self.obs_in, self.act_out],
                                            self.bw_act_loss,
                                            updates=bw_s_to_a_update,
                                            allow_input_downcast=True)
        self.bw_obs_train = theano.function([self.obsact_in, self.diff_out],
                                            bw_obs_loss,
                                            updates=bw_sa_to_s_update,
                                            allow_input_downcast=True)
    def __init__(
        self,
        env_spec,
        env,  # the inner one, I believe
        pkl_path=None,  # for the entire hierarchical policy
        snn_pkl_path=None,
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        period=2,  # how often the manager chooses latent skill
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32)):
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.period = period
        self.latent_dim = latent_dim  # unsure
        self.bilinear_integration = bilinear_integration  # unsure
        self.count = 0  # keep track of how long it's been since sampling a latent skill
        self.curr_latent = None  # something
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager

        if pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy

            #following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=env.spec,
                env=env,
                pkl_path=snn_pkl_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_selector)

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(
                    os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
                outer_env_spec = EnvSpec(
                    observation_space=self.env.observation_space,
                    action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(
                    env_spec=outer_env_spec,
                    latent_dim=latent_dim,
                )
        Serializable.quick_init(self,
                                locals())  # todo: is this where this belongs?
    def __init__(
        self,
        env_spec,
        env,  # the inner one, I believe
        pkl_path=None,  # for the entire hierarchical policy, can take in npz too!
        snn_pkl_path=None,  # can actually be either pkl or npz
        snn_json_path=None,
        period=10,  # how often the manager chooses latent skill
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32),
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        continuous_latent=False):  # assumed parameters; both names are referenced in the body below
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.period = period
        self.latent_dim = latent_dim  # unsure
        self.bilinear_integration = bilinear_integration  # unsure
        self.count = 0  # keep track of how long it's been since sampling a latent skill
        self.curr_latent = None
        self.curr_manager_obs = None
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager
        self.trainable_snn = trainable_snn
        self.continuous_latent = continuous_latent

        if pkl_path and '.npz' not in pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy
            #todo: the above is wrong, need to figure out how to warm start the params

            #following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            if snn_pkl_path is not None and '.npz' in snn_pkl_path:
                npz_path = snn_pkl_path
                snn_pkl_path = None
            else:
                npz_path = None

            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=env.spec,
                env=env,
                pkl_path=snn_pkl_path,
                npz_path=npz_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_selector)

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(
                    os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)

                if self.continuous_latent:
                    outer_env_spec = EnvSpec(
                        observation_space=self.env.observation_space,
                        action_space=spaces.Box(-1.0,
                                                1.0,
                                                shape=(latent_dim, )))
                    self.manager = GaussianMLPPolicy(env_spec=outer_env_spec)
                else:
                    outer_env_spec = EnvSpec(
                        observation_space=self.env.observation_space,
                        action_space=self.outer_action_space)
                    self.manager = CategoricalMLPPolicy(
                        env_spec=outer_env_spec,
                        latent_dim=latent_dim,
                    )
                # import ipdb; ipdb.set_trace()
                if pkl_path is not None and '.npz' in pkl_path:
                    param_dict = dict(
                        np.load(os.path.join(config.PROJECT_PATH, pkl_path)))
                    param_values = param_dict['params']
                    self.set_param_values(param_values)

        Serializable.quick_init(self,
                                locals())  # todo: is this where this belongs?
Example #18
def run_experiment(variant):
    # print('MuJoCo')
    # env = normalize(GymEnv('HalfCheetah-v1'))
    # -----------------------------------------------------
    print('Unity3D environment')
    env = UnityEnv('/home/recharrs/Apps/UnityEnvob3/RollerBall.x86_64', time_state=True, idx=args.idx, no_graphics=args.no_graphics)
    # -----------------------------------------------------
    obs_space = env.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low.flatten(), np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high.flatten(), np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=5000,
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=0,       # must be set to 0 or it will raise an error
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
    )

    algorithm.train()
Example #19
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )


    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],

        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),

    )

    algorithm.train()
    def spec(self):
        return EnvSpec(
            observation_space=self.env_agents[0].observation_space,
            action_space=self.env_agents[0].action_space,
        )
Example #21
 def spec(self):
     return EnvSpec(
         observation_space=self.observation_space,
         action_space=self.action_space,
     )
Example #22
    def __init__(self,
                 setup='dxl_tracker_default',
                 idn=9,
                 baudrate=1000000,
                 obs_history=2,
                 dt=0.01,
                 gripper_dt=0.006,
                 rllab_box=False,
                 episode_length_step=None,
                 episode_length_time=4,
                 dof=1,
                 max_torque_mag=300,
                 control_type='torque',
                 target_type='position',
                 reset_type='zero',
                 reward_type='linear',
                 delay=0,
                 dxl_dev_path='None',
                 max_velocity=5,
                 history_dt=None,
                 use_ctypes_driver=True,
                 **kwargs
                 ):
        """ Inits DxlTracker1DEnv class with task and servo specific parameters.

        Args:
            setup: A dictionary containing DXL tracker task specifications,
                such as bounding box dimensions, joint angle ranges and max load.
            idn: An integer representing the DXL ID number
            baudrate: An integer representing a baudrate to connect at
            obs_history: An integer number of sensory packets concatenated
                into a single observation vector
            dt: A float specifying duration of an environment time step
                in seconds.
            gripper_dt: A float representing DXLCommunicator cycle time
            rllab_box: A bool specifying whether to wrap environment
                action and observation spaces into an RllabBox object
                (required for off-the-shelf rllab algorithms implementations).
            episode_length_time: A float duration of an episode defined
                in seconds
            episode_length_step: An integer duration of an episode
                defined in environment steps.
            dof: an integer number of degrees of freedom
            max_torque_mag: An integer representing max possible torque command
                to be sent to the DXL device
            control_type: A string specifying the control space
                (only "torque" is implemented in this class)
            target_type: A string specifying in what space to provide
                target coordinates, either "position" for Cartesian space
                or "angle" for joints angles space.
            reset_type: A string specifying whether to reset the arm to a
                fixed position or to a random position.
            reward_type: A string specifying the reward function,
                (e.g.,  "linear" for - d_t)
            delay: A float specifying artificial observation delay in seconds
            dxl_dev_path: A string containing the serial port address
                (e.g., /dev/ttyACM0 or /dev/ttyUSB0 on linux)
            max_velocity: A float representing the max possible velocity command
                to be sent to the DXL device
            history_dt: A float; if set, the last action included in the
                observation is the action chosen at t - history_dt
            use_ctypes_driver: A bool. Setting it to True chooses CType-based driver.
                We found the CType-based driver to provide substantially more timely
                and precise communication compared to the pyserial-based one.
            **kwargs: Keyword arguments
        """

        self.max_temperature = 60
        self.cool_down_temperature = 50
        self.obs_history = obs_history
        self.dt = dt
        self.gripper_dt = gripper_dt

        self.max_torque_mag = np.array([max_torque_mag])
        self.max_velocity = np.array([max_velocity])

        if rllab_box:
            from rllab.spaces import Box as RlBox  # use this for rllab TRPO
            from rllab.envs.env_spec import EnvSpec
            Box = RlBox
        else:
            from gym.spaces import Box as GymBox  # use this for baselines algos
            Box = GymBox

        if control_type not in ['torque']:
            raise NotImplementedError('{} control not implemented'.format(control_type))
        self.control_type = control_type

        if target_type not in ['position']:
            raise NotImplementedError('{} target not implemented'.format(target_type))
        self.target_type = target_type

        if reset_type not in ['zero', 'random', 'none']:
            raise NotImplementedError('{} reset not implemented'.format(reset_type))
        self.reset_type = reset_type

        if reward_type not in ['linear']:
            raise NotImplementedError('{} reward not implemented'.format(reward_type))
        self.reward_type = reward_type

        if control_type == 'torque':
            self.action_low = -self.max_torque_mag
            self.action_high = +self.max_torque_mag
        elif control_type == 'velocity':
            self.action_low = -self.max_velocity
            self.action_high = +self.max_velocity

        if setup not in setups:
            raise NotImplementedError('Config not found')
        self.angle_low = setups[setup]['angles_low'][0]
        self.angle_high = setups[setup]['angles_high'][0]

        # Load value for detecting a closed gripper during reset
        self.high_load = setups[setup]['high_load'][0]

        self._present_pos_ = np.zeros((obs_history, 1))

        self._observation_space = Box(
            low=np.array(
                # list(0*np.ones(self.obs_history))  # torque enable
                # + list(0*np.ones(self.obs_history))  # alarm led
                # + list(0*np.ones(self.obs_history))  # led
                list(-pi * np.ones(self.obs_history))  # present position
                + list(self.angle_low * np.ones(2))  # target position
                + list(-np.inf * np.ones(self.obs_history))  # present speed
                # + list(-np.inf*np.ones(self.obs_history))  # present load
                # + list(0 * np.ones(self.obs_history))  # temperature
                # + list(0*np.ones(self.obs_history))  # registered
                # + list(0*np.ones(self.obs_history))  # moving
                # + list(-np.inf * np.ones(self.obs_history))  # current
                # + list(-np.inf*np.ones(self.obs_history))  # voltage
                + list(self.action_low * np.ones(self.obs_history))  # last action
            ),
            high=np.array(
                # list(1 * np.ones(self.obs_history))  # torque enable
                # + list(128 * np.ones(self.obs_history))  # alarm led
                # + list(1 * np.ones(self.obs_history))  # led
                list(pi * np.ones(self.obs_history))  # present position
                + list(self.angle_high * np.ones(2))  # target position
                + list(+np.inf * np.ones(self.obs_history))  # present speed
                # + list(+np.inf * np.ones(self.obs_history))  # present load
                # + list(255 * np.ones(self.obs_history))  # temperature
                # + list(1 * np.ones(self.obs_history))  # registered
                # + list(1 * np.ones(self.obs_history))  # moving
                # + list(+np.inf * np.ones(self.obs_history))  # current
                # + list(+np.inf * np.ones(self.obs_history))  # voltage
                + list(self.action_high * np.ones(self.obs_history))  # last action
            )
        )
        self._action_space = Box(low=self.action_low, high=self.action_high)

        if rllab_box:
            # Build the rllab EnvSpec only once both spaces are defined.
            self._spec = EnvSpec(self.observation_space, self.action_space)

        self._comm_name = 'DxlTracker1D'
        self._dxl_dev_path = dxl_dev_path
        communicator_setups = {
            self._comm_name: {
                'Communicator': gcomm.DXLCommunicator,
                'num_sensor_packets': obs_history,
                'kwargs': {
                    'idn': idn,
                    'baudrate': baudrate,
                    'sensor_dt': gripper_dt,
                    'device_path': self._dxl_dev_path,
                    'use_ctypes_driver': use_ctypes_driver,
                }
            }
        }
        super(DxlTracker1DEnv, self).__init__(
            communicator_setups=communicator_setups,
            action_dim=1,
            observation_dim=self.observation_space.shape[0],
            dt=dt,
            **kwargs
        )

        read_block = dxl_mx64.MX64.subblock('version_0', 'goal_acceleration', ret_dxl_type=use_ctypes_driver)
        self.regnames = [reg.name for reg in read_block]
        self.reg_index = dict(zip(self.regnames, range(len(self.regnames))))

        self.episode_steps = Value('i', 0)
        if episode_length_step is not None:
            assert episode_length_time is None
            self.episode_length_step = episode_length_step
            self.episode_length_time = episode_length_step * dt
        elif episode_length_time is not None:
            assert episode_length_step is None
            self.episode_length_time = episode_length_time
            self.episode_length_step = int(episode_length_time / dt)
        else:
            # TODO: should we allow a continuous behaviour case here, with no episodes?
            print("episode_length_time or episode_length_step needs to be set")
            raise AssertionError

        # Task Parameters
        self.obs_history = obs_history
        self.dof = dof
        self.delay = delay
        self.pos_range = self.angle_high - self.angle_low

        if history_dt is not None:
            self.history_dt = history_dt
            self.dt_ratio = int(history_dt/self.gripper_dt)
        else:
            self.dt_ratio = int(self.dt/self.gripper_dt)
        self.comm_episode_length_step = self.episode_length_step * self.dt_ratio

        # Default initialization
        target_pos = np.random.uniform(low=self.angle_low, high=self.angle_high)
        self.pos_range = self.angle_high - self.angle_low
        self.reset_pos_center = self.angle_high - (self.pos_range / 2.)
        self.action_range = self.action_high - self.action_low

        self._reward_ = Value('d', 0.0)
        self._reset_pos_ = Value('d', self.reset_pos_center)
        self._present_pos_ = np.frombuffer(Array('f', self.obs_history).get_obj(), dtype='float32')
        self._target_pos_ = Value('d', target_pos)
        self._temperature_ = [0] * self.obs_history
        self._action_history = deque([0] * (self.obs_history + 1), self.obs_history + 1)

        # Generate trajectory
        self.generate_trajectory()

        # Tell the dxl to do nothing (overwriting previous command)
        self.nothing_packet = np.zeros(self._actuator_comms[self._comm_name].actuator_buffer.array_len)

        # PID control gains for reset
        self.kp = 161.1444  # Proportional gain
        self.ki = 0  # Integral gain
        self.kd = 0  # Derivative gain
    def __init__(
            self,
            env_spec,
            env,  # the inner one, I believe
            pkl_path=None,  # for the entire hierarchical policy
            snn_pkl_path=None,
            snn_json_path=None,
            manager_pkl_path=None,  # default is to initialize a new manager from scratch
            max_period=10,  # possible periods
            latent_dim=6,
            bilinear_integration=True,
            trainable_snn=True,
            trainable_manager=True,
            hidden_sizes_snn=(64, 64),
            hidden_sizes_selector=(32, 32)):
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.periods = np.arange(1, max_period + 1)
        assert len(self.periods) > 0
        self.curr_period = self.periods[0]
        self.max_period = max(self.periods)
        self.latent_dim = latent_dim  # unsure
        self.bilinear_integration = bilinear_integration  # unsure
        self.count = 0  # keep track of how long it's been since sampling a latent skill
        self.curr_latent = None  # something
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager
        self.random_period = True
        self.fake_env = PeriodVaryingEnv(env)

        if pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy

            # following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            # env spec that includes the extra parameter for time
            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=self.fake_env.spec,
                env=self.fake_env,
                pkl_path=snn_pkl_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_selector
            )

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
                outer_env_spec = EnvSpec(observation_space=self.fake_env.observation_space,
                                         action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )

        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        Serializable.quick_init(self, locals())  # todo: ask if this fixes my problem