def __init__(self, desc='two-state', map_id=None):
    """Initialize the environment with a map description.

    :param desc: either a key into the predefined ``MAPS`` table or an
        explicit map description.
    :param map_id: optional identifier for the selected map.
    """
    self._map_id = map_id
    Serializable.quick_init(self, locals())
    # A string selects one of the predefined map collections.
    self.desc_choices = MAPS[desc] if isinstance(desc, str) else desc
    self.reset()
Пример #2
0
 def __init__(self, *args, **kwargs):
     """Set up the diffractometer geometry and load the MuJoCo model.

     Constants:
         Om: identity orientation matrix -- omega is always 0.
         background: constant background noise level.

     Variables read along with the action:
         two_theta: detector's rotation about the z-axis -- elastic
             scattering is assumed, so omega is always 0.
         theta: the angle at which our neutrons strike the plane.

     The two dimensions of our problem:
         chi: outer ring's rotation about the x-axis.
         phi: rotation of the eulerian cradle; varies between z- and
             y-axis rotation depending on how much chi rotated.
     """
     # NOTE: the original code held the variable documentation above in a
     # bare triple-quoted string mid-function -- a no-op expression
     # statement, not a comment -- so it has been folded into the real
     # docstring.
     self.Om = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
     self.background = 2.0  # background noise

     # Angular limits of the instrument (degrees).
     self.max_two_theta = 180
     self.max_chi = 90
     self.max_phi = 360
     self.min_chi = -90
     self.min_phi = 0
     self.hit = 0

     # Set up hkl and all actions.
     super(UBEnv, self).__init__(self.model_path("UB.xml.mako"), *args, **kwargs)

     # Two independent bodies.
     self.ring = find_body(self.world, "ring")  # chi
     self.eu_cradle = find_body(self.world, "eu_cradle")  # phi
     self.detector = find_body(self.world, "detector")  # theta
     self.pivot = find_joint(self.world, "angular_axis")  # pivot that enables angular movement
     Serializable.__init__(self, *args, **kwargs)
Пример #3
0
    def __init__(self,
                 epsilon=0.5,
                 L2_reg_dual=0.,  # 1e-5,
                 L2_reg_loss=0.,
                 max_opt_itr=50,
                 optimizer=scipy.optimize.fmin_l_bfgs_b,
                 **kwargs):
        """Configure Relative Entropy Policy Search.

        :param epsilon: max KL divergence between new policy and old policy.
        :param L2_reg_dual: dual regularization coefficient.
        :param L2_reg_loss: loss regularization coefficient.
        :param max_opt_itr: maximum number of batch optimization iterations.
        :param optimizer: optimizer callable; must support the same
            interface as scipy.optimize.fmin_l_bfgs_b.
        """
        Serializable.quick_init(self, locals())
        super(REPS, self).__init__(**kwargs)
        # Hyper-parameters.
        self.epsilon = epsilon
        self.max_opt_itr = max_opt_itr
        self.L2_reg_dual = L2_reg_dual
        self.L2_reg_loss = L2_reg_loss
        self.optimizer = optimizer
        # Populated later (presumably during optimization setup).
        self.opt_info = None
Пример #4
0
    def __init__(self, env_spec, hidden_sizes=(32, 32), hidden_nonlinearity=NL.tanh, prob_network=None):
        """Categorical MLP policy for discrete action spaces.

        :param env_spec: a spec for the mdp.
        :param hidden_sizes: sizes of the fully connected hidden layers.
        :param hidden_nonlinearity: nonlinearity applied in each hidden layer.
        :param prob_network: manually specified network for this policy;
            when given, the other network parameters are ignored.
        """
        Serializable.quick_init(self, locals())

        action_space = env_spec.action_space
        assert isinstance(action_space, Discrete)

        if prob_network is None:
            # Default network: flat observations in, softmax over actions out.
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim,),
                output_dim=action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_obs = prob_network.input_layer
        self._l_prob = prob_network.output_layer
        # Compiled forward pass: observations -> action probabilities.
        self._f_prob = ext.compile_function(
            [self._l_obs.input_var], L.get_output(self._l_prob)
        )
        self._dist = Categorical(action_space.n)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [self._l_prob])
Пример #5
0
 def __init__(self, env, obs_noise=1e-1):
     """Wrap ``env`` so observations can be perturbed with noise.

     :param env: environment to wrap.
     :param obs_noise: magnitude of the observation noise.
     """
     super(NoisyObservationEnv, self).__init__(env)
     Serializable.quick_init(self, locals())
     self.obs_noise = obs_noise
Пример #6
0
    def __init__(self, env, ma_mode):
        """Wrap a multi-agent environment for the given mode.

        :param env: the multi-agent environment to wrap.
        :param ma_mode: one of 'centralized', 'decentralized', 'concurrent'.
        """
        Serializable.quick_init(self, locals())

        self.env = env
        # Fall back to a generic id when the wrapped env has none.
        self.env_id = getattr(env, 'id', 'MA-Wrapper-v0')

        # Centralized mode exposes the joint space of all agents; the other
        # modes expose a single agent's spaces.
        if ma_mode == 'centralized':
            n_agents = len(env.agents)
        elif ma_mode in ('decentralized', 'concurrent'):
            n_agents = 1
        else:
            raise NotImplementedError

        agent0 = env.agents[0]
        self._observation_space = convert_gym_space(agent0.observation_space,
                                                    n_agents=n_agents)
        self._action_space = convert_gym_space(agent0.action_space,
                                               n_agents=n_agents)
        # Default horizon when the wrapped env declares no timestep limit.
        self._horizon = getattr(env, 'timestep_limit', 250)
Пример #7
0
 def __init__(self, obj, method_name, args, kwargs):
     """Record a deferred method call: ``obj.method_name(*args, **kwargs)``.

     :param obj: target object.
     :param method_name: name of the method to invoke on ``obj``.
     :param args: positional arguments for the call.
     :param kwargs: keyword arguments for the call.
     """
     # Cleared before quick_init -- presumably to force re-initialization
     # of the serialization machinery.
     self._serializable_initialized = False
     Serializable.quick_init(self, locals())
     self.obj = obj
     self.method_name = method_name
     self.args = args
     self.kwargs = kwargs
 def __init__(self, goal_vel=None, *args, **kwargs):
     """Half-cheetah task whose running direction is set by ``goal_vel``.

     :param goal_vel: target velocity/direction; also passed to the
         initial ``reset`` call.
     """
     self.goal_vel = goal_vel
     super(HalfCheetahEnvRandDirec, self).__init__(*args, **kwargs)
     # NOTE(review): goal_vel is re-assigned after each base initializer,
     # presumably defending against the bases clobbering it -- confirm
     # whether the repeated assignments are actually required.
     self.goal_vel = goal_vel
     Serializable.__init__(self, *args, **kwargs)
     self.goal_vel = goal_vel
     self.reset(reset_args=goal_vel)
Пример #9
0
    def __init__(self, env_name, record_video=True, video_schedule=None, log_dir=None, record_log=True,
                 force_reset=False):
        """Wrap a Gym environment, optionally monitoring logs and video.

        :param env_name: Gym environment id passed to ``gym.envs.make``.
        :param record_video: whether to record video via the monitor.
        :param video_schedule: schedule deciding which episodes to record;
            defaults to a capped cubic schedule when recording is enabled.
        :param log_dir: directory for monitor output; derived from the
            snapshot dir when omitted.
        :param record_log: whether to record the monitor log at all.
        :param force_reset: stored flag, read elsewhere in the class.
        """
        # Derive a log dir from the configured snapshot directory; with no
        # snapshot dir configured, monitoring is skipped entirely.
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)
        self.env = env
        self.env_id = env.spec.id

        # Silence the monitor's own chatter below WARNING.
        monitor_manager.logger.setLevel(logging.WARNING)

        # Video requires the log to be recorded as well.
        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        self._action_space = convert_gym_space(env.action_space)
        self._horizon = env.spec.timestep_limit
        self._log_dir = log_dir
        self._force_reset = force_reset
Пример #10
0
 def __init__(self, env, policy, n_itr=500, max_path_length=500,
              discount=0.99, sigma0=1., batch_size=None, plot=False,
              **kwargs):
     """Configure the parameter-distribution search algorithm.

     :param env: environment to optimize in.
     :param policy: policy whose parameters are searched over.
     :param n_itr: number of iterations.
     :param max_path_length: maximum length of a single rollout.
     :param discount: discount factor.
     :param sigma0: initial std for the parameter distribution.
     :param batch_size: number of samples from trajectories drawn from the
         parameter distribution; when set, n_samples is ignored.
     :param plot: plot an evaluation run after each iteration.
     """
     Serializable.quick_init(self, locals())
     self.env = env
     self.policy = policy
     # Rollout / iteration settings.
     self.n_itr = n_itr
     self.max_path_length = max_path_length
     self.batch_size = batch_size
     self.discount = discount
     # Search distribution and diagnostics.
     self.sigma0 = sigma0
     self.plot = plot
Пример #11
0
 def __init__(self, name, max_opt_itr=20, callback=None):
     """Configure the optimizer.

     :param name: name for this optimizer instance.
     :param max_opt_itr: maximum number of optimization iterations.
     :param callback: optional callable invoked during optimization.
     """
     Serializable.quick_init(self, locals())
     self._name = name
     self._max_opt_itr = max_opt_itr
     self._callback = callback
     # Filled in later, presumably by an update_opt-style setup call.
     self._opt_fun = None
     self._target = None
Пример #12
0
 def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
     """Swimmer environment with a configurable control-cost coefficient.

     :param ctrl_cost_coeff: weight of the control cost term.
     """
     self.ctrl_cost_coeff = ctrl_cost_coeff
     super(SwimmerEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
    def __init__(self, cg_iters=10, reg_coeff=1e-5, subsample_factor=0.1,
                 backtrack_ratio=0.8, max_backtracks=15, debug_nan=False):
        """Configure the conjugate-gradient optimizer.

        :param cg_iters: number of CG iterations used to calculate A^-1 g.
        :param reg_coeff: small value so that A -> A + reg*I.
        :param subsample_factor: subsampling factor to reduce samples when
            using conjugate gradient; the descent-direction computation
            dominates, so this can greatly reduce overall computation time.
        :param backtrack_ratio: step shrink ratio for the backtracking search.
        :param max_backtracks: maximum number of backtracking steps.
        :param debug_nan: if True, NanGuard is added to the compilation and
            ipdb is invoked when a NaN is detected.
        """
        Serializable.quick_init(self, locals())
        # Search hyper-parameters.
        self._cg_iters = cg_iters
        self._reg_coeff = reg_coeff
        self._subsample_factor = subsample_factor
        self._backtrack_ratio = backtrack_ratio
        self._max_backtracks = max_backtracks
        self._debug_nan = debug_nan
        # Populated later by the optimization setup.
        self._opt_fun = None
        self._target = None
        self._max_constraint_val = None
        self._constraint_name = None
Пример #14
0
    def __init__(self,
                 update_method=lasagne.updates.adam,
                 learning_rate=1e-3,
                 max_epochs=1000,
                 tolerance=1e-6,
                 batch_size=32,
                 callback=None,
                 verbose=False,
                 **kwargs):
        """Configure a first-order (gradient-descent style) optimizer.

        :param update_method: lasagne update rule used for each step.
        :param learning_rate: learning rate bound into ``update_method``.
        :param max_epochs: maximum number of training epochs.
        :param tolerance: convergence tolerance.
        :param batch_size: None or an integer; if None the whole dataset
            is used.
        :param callback: optional callable invoked during optimization.
        :param verbose: whether to print progress.
        """
        Serializable.quick_init(self, locals())
        self._opt_fun = None
        self._target = None
        self._callback = callback
        # Bind the learning rate into the update method up front.
        self._update_method = partial(update_method, learning_rate=learning_rate)
        self._max_epochs = max_epochs
        self._tolerance = tolerance
        self._batch_size = batch_size
        self._verbose = verbose
Пример #15
0
 def __init__(self, regressors):
     """Aggregate several regressors into one composite regressor.

     :param regressors: list of individual regressors.
     """
     Serializable.quick_init(self, locals())
     self.regressors = regressors
     # Per-regressor output dimensionalities, in the same order.
     self.output_dims = [regressor.output_dim for regressor in regressors]
    def __init__(
            self,
            name,
            max_opt_itr=20,
            initial_penalty=1.0,
            min_penalty=1e-2,
            max_penalty=1e6,
            increase_penalty_factor=2,
            decrease_penalty_factor=0.5,
            max_penalty_itr=10,
            adapt_penalty=True):
        """Configure a penalized optimizer with an adaptive penalty schedule.

        :param name: name for this optimizer instance.
        :param max_opt_itr: maximum number of optimization iterations.
        :param initial_penalty: starting penalty coefficient.
        :param min_penalty: lower clamp for the penalty.
        :param max_penalty: upper clamp for the penalty.
        :param increase_penalty_factor: multiplier applied when increasing.
        :param decrease_penalty_factor: multiplier applied when decreasing.
        :param max_penalty_itr: max iterations of penalty adaptation.
        :param adapt_penalty: whether the penalty is adapted at all.
        """
        Serializable.quick_init(self, locals())
        self._name = name
        self._max_opt_itr = max_opt_itr
        # Penalty schedule state.
        self._penalty = initial_penalty
        self._initial_penalty = initial_penalty
        self._min_penalty = min_penalty
        self._max_penalty = max_penalty
        self._increase_penalty_factor = increase_penalty_factor
        self._decrease_penalty_factor = decrease_penalty_factor
        self._max_penalty_itr = max_penalty_itr
        self._adapt_penalty = adapt_penalty
        # Populated later by the optimization setup.
        self._opt_fun = None
        self._target = None
        self._max_constraint_val = None
        self._constraint_name = None
Пример #17
0
 def __init__(self, mdp_cls, mdp_args):
     """Proxy env that instantiates an MDP with noisy template arguments.

     :param mdp_cls: MDP class to instantiate.
     :param mdp_args: keyword arguments for the MDP constructor.
     """
     Serializable.quick_init(self, locals())
     self.mdp_cls = mdp_cls
     # Copy the args so the caller's dict is never mutated, then force
     # noisy templates.
     args = dict(mdp_args)
     args["template_args"] = dict(noise=True)
     self.mdp_args = args
     super(IdentificationEnv, self).__init__(self.gen_mdp())
Пример #18
0
 def __init__(self, observation_space, action_space):
     """Hold an observation space and an action space as a spec.

     :param observation_space: the observation space.
     :param action_space: the action space.
     """
     Serializable.quick_init(self, locals())
     self._observation_space = observation_space
     self._action_space = action_space
Пример #19
0
    def __init__(self, goal_reward=10, actuation_cost_coeff=30,
                 distance_cost_coeff=1, init_sigma=0.1):
        """2-D point environment with four symmetric goals.

        :param goal_reward: reward granted at a goal.
        :param actuation_cost_coeff: weight of the action cost term.
        :param distance_cost_coeff: weight of the distance cost term.
        :param init_sigma: std of the initial-state distribution.
        """
        super().__init__()
        Serializable.quick_init(self, locals())

        # Deterministic 2-D point-mass dynamics, initial state ~ N(0, sigma).
        self.dynamics = PointDynamics(dim=2, sigma=0)
        self.init_mu = np.zeros(2, dtype=np.float32)
        self.init_sigma = init_sigma
        # Four goals on the axes, 5 units from the origin.
        self.goal_positions = np.array(
            [[5, 0], [-5, 0], [0, 5], [0, -5]], dtype=np.float32)
        self.goal_threshold = 1.
        self.goal_reward = goal_reward
        self.action_cost_coeff = actuation_cost_coeff
        self.distance_cost_coeff = distance_cost_coeff
        # Workspace bounds and velocity limit.
        self.xlim = (-7, 7)
        self.ylim = (-7, 7)
        self.vel_bound = 1.
        self.reset()
        self.observation = None

        # Plotting state (lazily created).
        self._ax = None
        self._env_lines = []
        self.fixed_plots = None
        self.dynamic_plots = []
Пример #20
0
    def __init__(self, env_spec, obs_pl, action, scope_name=None):
        """Policy backed by a pre-built TF graph.

        :param env_spec: environment spec.
        :param obs_pl: observation placeholder tensor.
        :param action: action output tensor.
        :param scope_name: variable scope holding the policy's variables;
            defaults to the current TF variable scope.
        """
        Serializable.quick_init(self, locals())

        self._obs_pl = obs_pl
        self._action = action
        if scope_name:
            self._scope_name = scope_name
        else:
            # Fall back to whatever scope we are being constructed in.
            self._scope_name = tf.get_variable_scope().name
        super(NNPolicy, self).__init__(env_spec)
Пример #21
0
 def __init__(self, env_spec, max_sigma=1.0, min_sigma=0.1, decay_period=1000000):
     """Exploration strategy with a linearly decaying noise scale.

     :param env_spec: environment spec; must have a 1-D Box action space.
     :param max_sigma: initial noise std.
     :param min_sigma: final noise std after decay.
     :param decay_period: number of steps over which sigma decays.
     """
     # Only 1-D continuous (Box) action spaces are supported.
     assert isinstance(env_spec.action_space, Box)
     assert len(env_spec.action_space.shape) == 1
     Serializable.quick_init(self, locals())
     self._action_space = env_spec.action_space
     self._max_sigma = max_sigma
     self._min_sigma = min_sigma
     self._decay_period = decay_period
Пример #22
0
    def __setstate__(self, state):
        """Restore Serializable state of the RLAlgorithm instance."""
        Serializable.__setstate__(self, state)
        # Re-hydrate each sub-component from its saved parameters.
        self.qf.set_param_values(state['qf-params'])
        self.policy.set_param_values(state['policy-params'])
        # The pool and env carry their own pickling protocols.
        self.pool.__setstate__(state['pool'])
        self.env.__setstate__(state['env'])
 def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
     """Swimmer environment with a randomized goal velocity.

     :param ctrl_cost_coeff: weight of the control cost term.
     """
     self.ctrl_cost_coeff = ctrl_cost_coeff
     # The goal velocity is chosen later (presumably on reset).
     self._goal_vel = None
     super(SwimmerRandGoalEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
Пример #24
0
    def __init__(self, *inputs, name, hidden_layer_sizes):
        """Scalar-output value function over the given input tensors.

        :param inputs: input tensors fed to the network.
        :param name: name of this function (keyword-only).
        :param hidden_layer_sizes: sizes of the hidden layers (keyword-only).
        """
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        self._name = name
        self._inputs = inputs
        # Hidden layers followed by a single scalar output unit.
        self._layer_sizes = [*hidden_layer_sizes, 1]

        self._output = self._output_for(*self._inputs)
Пример #25
0
 def __init__(self, alive_coeff=1, ctrl_cost_coeff=0.01, *args, **kwargs):
     """Hopper environment with configurable reward coefficients.

     :param alive_coeff: weight of the alive bonus.
     :param ctrl_cost_coeff: weight of the control cost term.
     """
     self.alive_coeff = alive_coeff
     self.ctrl_cost_coeff = ctrl_cost_coeff
     super(HopperEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
Пример #26
0
 def __init__(self, env, action_delay=3):
     """Wrap ``env`` so actions take effect after a fixed delay.

     :param env: environment to wrap.
     :param action_delay: number of steps each action is delayed by.
     :raises ValueError: if ``action_delay`` is not positive.
     """
     # Raise explicitly instead of using `assert`, which is stripped
     # when Python runs with optimizations (-O).
     if action_delay <= 0:
         raise ValueError("Should not use this env transformer")
     super(DelayedActionEnv, self).__init__(env)
     Serializable.quick_init(self, locals())
     self.action_delay = action_delay
     # Queue of pending actions; populated lazily.
     self._queued_actions = None
Пример #27
0
 def __init__(self, max_opt_itr=20, batch_size=32, cg_batch_size=100, callback=None):
     """Configure the Hessian-free style optimizer.

     :param max_opt_itr: maximum number of optimization iterations.
     :param batch_size: minibatch size for the objective.
     :param cg_batch_size: minibatch size for conjugate-gradient products.
     :param callback: optional callable invoked during optimization.
     """
     Serializable.quick_init(self, locals())
     self._max_opt_itr = max_opt_itr
     self._batch_size = batch_size
     self._cg_batch_size = cg_batch_size
     self._callback = callback
     # Populated later by the optimization setup.
     self._opt_fun = None
     self._target = None
     self._hf_optimizer = None
Пример #28
0
    def __init__(self, desc_str='4x4', max_traj_length=10, goal_reward=10.0):
        """Gridworld environment initialized from a named map description.

        :param desc_str: key identifying the map; the map itself is loaded
            in ``self.reset``.
        :param max_traj_length: maximum trajectory length.
        :param goal_reward: reward granted when the goal is reached.
        """
        Serializable.quick_init(self, locals())
        self.desc_str = desc_str  # Map will be loaded in `self.reset`
        self.max_traj_length = max_traj_length

        # Bug fix: under Python 3, `map` returns an iterator, so
        # np.array(map(...)) would build a 0-d object array and the shape
        # unpack would fail. Materialize the rows into a list first.
        self.n_row, self.n_col = np.array(list(map(list, self._fetch_map()))).shape

        self.state = None
        self.goal_reward = goal_reward
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.relu,
            action_merge_layer=-2,
            output_nonlinearity=None,
            bn=False):
        """Build a Q(s, a) network: observations and actions in, scalar out.

        :param env_spec: environment spec providing flat obs/action dims.
        :param hidden_sizes: sizes of the fully connected hidden layers.
        :param hidden_nonlinearity: nonlinearity for each hidden layer.
        :param action_merge_layer: index of the hidden layer at which the
            action input is concatenated in (may be negative).
        :param output_nonlinearity: nonlinearity for the scalar output.
        :param bn: whether to apply batch normalization before each layer.
        """
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions")

        n_layers = len(hidden_sizes) + 1

        # Normalize a possibly-negative merge index into [0, n_layers).
        if n_layers > 1:
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = batch_norm(l_hidden)

            # Concatenate the action input at the chosen depth.
            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % (idx + 1)
            )

        # Merge index past the last hidden layer: concatenate at the end.
        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(
            l_hidden,
            num_units=1,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        # Deterministic pass (e.g. batch-norm in inference mode).
        output_var = L.get_output(l_output, deterministic=True)

        self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        LayersPowered.__init__(self, [l_output])
Пример #30
0
 def __init__(self, *args, **kwargs):
     """Cart-pole swing-up environment loaded from a mako template."""
     super(CartpoleSwingupEnvX, self).__init__(
         self.model_path("cartpole.xml.mako"), *args, **kwargs)
     # Position limits used by the reward computation.
     self.max_cart_pos = 3
     self.max_reward_cart_pos = 3
     # Bodies looked up from the loaded world.
     self.cart = find_body(self.world, "cart")
     self.pole = find_body(self.world, "pole")
     Serializable.__init__(self, *args, **kwargs)
Пример #31
0
    def __init__(
            self,
            # goal_generator,
            n_bins=20,
            sensor_range=10.,
            sensor_span=math.pi,
            maze_id=0,
            length=1,
            maze_height=0.5,
            maze_size_scaling=2,
            coef_inner_rew=1.,  # a coef of 0 gives no reward to the maze from the wrapped env.
            # goal_rew=1.,  # reward obtained when reaching the goal
            include_maze_obs=False,
            *args,
            **kwargs):
        """Wrap ``MODEL_CLASS`` in a procedurally generated maze.

        Loads the robot's MuJoCo XML, injects maze wall geoms around it,
        precomputes the wall/goal line segments used for sensing, and
        initializes the proxied inner environment from a temporary XML file.

        :param n_bins: number of sensor bins.
        :param sensor_range: range of the maze sensors.
        :param sensor_span: angular span of the sensors.
        :param maze_id: which predefined maze structure to build.
        :param length: maze length parameter passed to construct_maze.
        :param maze_height: wall height (before size scaling).
        :param maze_size_scaling: scale factor for maze cells.
        :param coef_inner_rew: coefficient on the wrapped env's reward; 0
            gives no reward to the maze from the wrapped env.
        :param include_maze_obs: whether maze observations are appended.
        """
        Serializable.quick_init(self, locals())
        self._n_bins = n_bins
        self._sensor_range = sensor_range
        self._sensor_span = sensor_span
        self._maze_id = maze_id
        self.length = length
        self.coef_inner_rew = coef_inner_rew
        # self.goal_rew = goal_rew
        self.include_maze_obs = include_maze_obs

        model_cls = self.__class__.MODEL_CLASS
        if model_cls is None:
            # Bug fix: the original `raise "MODEL_CLASS unspecified!"` raised
            # a plain string, which is a TypeError under Python 3 -- raise a
            # real exception instead.
            raise NotImplementedError("MODEL_CLASS unspecified!")
        xml_path = osp.join(MODEL_DIR, model_cls.FILE)
        tree = ET.parse(xml_path)
        worldbody = tree.find(".//worldbody")

        self.MAZE_HEIGHT = height = maze_height
        self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
        self.MAZE_STRUCTURE = structure = construct_maze(maze_id=self._maze_id,
                                                         length=self.length)
        # Select the linearized-coordinate table for mazes that have one.
        if self._maze_id == 0:
            self.LINEARIZED = MazeEnv.MAZE_0
        elif self._maze_id == 11:
            self.LINEARIZED = MazeEnv.MAZE_11
        elif self._maze_id == 13:
            self.LINEARIZED = MazeEnv.MAZE_13
        elif self._maze_id == 14:
            self.LINEARIZED = MazeEnv.MAZE_14
        else:
            self.LINEARIZED = None

        torso_x, torso_y = self._find_robot()
        self._init_torso_x = torso_x
        self._init_torso_y = torso_y

        # Inject one box geom per wall cell, offsetting all coordinates so
        # the robot starts at the origin.
        for i in range(len(structure)):
            for j in range(len(structure[0])):
                if str(structure[i][j]) == '1':
                    ET.SubElement(
                        worldbody,
                        "geom",
                        name="block_%d_%d" % (i, j),
                        pos="%f %f %f" %
                        (j * size_scaling - torso_x, i * size_scaling -
                         torso_y, height / 2 * size_scaling),
                        size="%f %f %f" %
                        (0.5 * size_scaling, 0.5 * size_scaling,
                         height / 2 * size_scaling),
                        type="box",
                        material="",
                        contype="1",
                        conaffinity="1",
                        rgba="0.4 0.4 0.4 0.5")

        torso = tree.find(".//body[@name='torso']")
        geoms = torso.findall(".//geom")
        for geom in geoms:
            if 'name' not in geom.attrib:
                raise Exception("Every geom of the torso must have a name "
                                "defined")

        segments = []

        # Get all line segments of the goal and the obstacles.
        for i in range(len(structure)):
            for j in range(len(structure[0])):
                if structure[i][j] == 1 or structure[i][j] == 'g':
                    cx = j * size_scaling - self._init_torso_x
                    cy = i * size_scaling - self._init_torso_y
                    x1 = cx - 0.5 * size_scaling
                    x2 = cx + 0.5 * size_scaling
                    y1 = cy - 0.5 * size_scaling
                    y2 = cy + 0.5 * size_scaling
                    struct_segments = [
                        ((x1, y1), (x2, y1)),
                        ((x2, y1), (x2, y2)),
                        ((x2, y2), (x1, y2)),
                        ((x1, y2), (x1, y1)),
                    ]
                    for seg in struct_segments:
                        segments.append(
                            dict(
                                segment=seg,
                                type=structure[i][j],
                            ))
        self.segments = segments

        # Optionally declare explicit contact pairs between torso geoms and
        # every wall block.
        if self.__class__.MAZE_MAKE_CONTACTS:
            contact = ET.SubElement(tree.find("."), "contact")
            for i in range(len(structure)):
                for j in range(len(structure[0])):
                    if str(structure[i][j]) == '1':
                        for geom in geoms:
                            ET.SubElement(contact,
                                          "pair",
                                          geom1=geom.attrib["name"],
                                          geom2="block_%d_%d" % (i, j))

        _, file_path = tempfile.mkstemp(text=True)
        tree.write(
            file_path
        )  # here we write a temporal file with the robot specifications. Why not the original one??

        self._goal_range = self._find_goal_range()
        self._cached_segments = None

        inner_env = model_cls(file_path=file_path, *args,
                              **kwargs)  # file to the robot specifications
        ProxyEnv.__init__(
            self, inner_env)  # here is where the robot env will be initialized
    def __init__(self,
                 env_spec,
                 name='qnet',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.relu,
                 action_merge_layer=-2,
                 output_nonlinearity=None,
                 hidden_W_init=L.XavierUniformInitializer(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_W_init=L.XavierUniformInitializer(),
                 output_b_init=tf.zeros_initializer(),
                 bn=False):
        """Build a TF Q(s, a) network: observations and actions in, scalar out.

        :param env_spec: environment spec providing flat obs/action dims.
        :param name: TF variable scope for the network.
        :param hidden_sizes: sizes of the fully connected hidden layers.
        :param hidden_nonlinearity: nonlinearity for each hidden layer.
        :param action_merge_layer: index of the hidden layer at which the
            action input is concatenated in (may be negative).
        :param output_nonlinearity: nonlinearity for the scalar output.
        :param hidden_W_init: weight initializer for hidden layers.
        :param hidden_b_init: bias initializer for hidden layers.
        :param output_W_init: weight initializer for the output layer.
        :param output_b_init: bias initializer for the output layer.
        :param bn: whether to apply batch normalization before each layer.
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):
            l_obs = L.InputLayer(shape=(None,
                                        env_spec.observation_space.flat_dim),
                                 name="obs")
            l_action = L.InputLayer(shape=(None,
                                           env_spec.action_space.flat_dim),
                                    name="actions")

            n_layers = len(hidden_sizes) + 1

            # Normalize a possibly-negative merge index into [0, n_layers).
            if n_layers > 1:
                action_merge_layer = \
                    (action_merge_layer % n_layers + n_layers) % n_layers
            else:
                action_merge_layer = 1

            l_hidden = l_obs

            for idx, size in enumerate(hidden_sizes):
                if bn:
                    l_hidden = L.batch_norm(l_hidden)

                # Concatenate the action input at the chosen depth.
                if idx == action_merge_layer:
                    l_hidden = L.ConcatLayer([l_hidden, l_action])

                l_hidden = L.DenseLayer(l_hidden,
                                        num_units=size,
                                        W=hidden_W_init,
                                        b=hidden_b_init,
                                        nonlinearity=hidden_nonlinearity,
                                        name="h%d" % (idx + 1))

            # Merge index past the last hidden layer: concatenate at the end.
            if action_merge_layer == n_layers:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_output = L.DenseLayer(l_hidden,
                                    num_units=1,
                                    W=output_W_init,
                                    b=output_b_init,
                                    nonlinearity=output_nonlinearity,
                                    name="output")

            #output_var = L.get_output(l_output, deterministic=True).flatten()
            # Flatten the (N, 1) output into a vector of Q-values.
            output_var = tf.reshape(L.get_output(l_output, deterministic=True),
                                    (-1, ))

            self._f_qval = tensor_utils.compile_function(
                [l_obs.input_var, l_action.input_var], output_var)
            self._output_layer = l_output
            self._obs_layer = l_obs
            self._action_layer = l_action
            self._output_nonlinearity = output_nonlinearity

            LayersPowered.__init__(self, [l_output])
Пример #33
0
    def __init__(
        self,
        name,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
    ):
        """Categorical MLP regressor fit by maximum likelihood.

        :param name: TF variable scope for the regressor.
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param prob_network: optional pre-built probability network; when
            given, the MLP construction parameters are ignored.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param tr_optimizer: optimizer used for trust-region updates.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: whether inputs are normalized by stored
            mean/std variables before the network.
        :param no_initial_trust_region: when True, the first fit skips the
            trust-region constraint.
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):
            # Default optimizers: L-BFGS for plain ML, CG for trust region.
            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            if prob_network is None:
                prob_network = MLP(input_shape=input_shape,
                                   output_dim=output_dim,
                                   hidden_sizes=hidden_sizes,
                                   hidden_nonlinearity=hidden_nonlinearity,
                                   output_nonlinearity=tf.nn.softmax,
                                   name="prob_network")

            l_prob = prob_network.output_layer

            LayersPowered.__init__(self, [l_prob])

            # Placeholders for targets and the old predicted distribution
            # (used by the KL constraint).
            xs_var = prob_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    shape=[None, output_dim],
                                    name="ys")
            old_prob_var = tf.placeholder(dtype=tf.float32,
                                          shape=[None, output_dim],
                                          name="old_prob")

            # Input normalization statistics, stored as (non-trainable)
            # variables so they serialize with the model.
            x_mean_var = tf.get_variable(name="x_mean",
                                         shape=(1, ) + input_shape,
                                         initializer=tf.constant_initializer(
                                             0., dtype=tf.float32))
            x_std_var = tf.get_variable(name="x_std",
                                        shape=(1, ) + input_shape,
                                        initializer=tf.constant_initializer(
                                            1., dtype=tf.float32))

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            prob_var = L.get_output(
                l_prob, {prob_network.input_layer: normalized_xs_var})

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=prob_var)

            dist = self._dist = Categorical(output_dim)

            # Mean KL between old and new predicted distributions (the
            # trust-region constraint), and the negative log-likelihood loss.
            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            # Hard (one-hot argmax) predictions.
            predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1),
                                                   output_dim)

            self.prob_network = prob_network
            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
            self.l_prob = l_prob

            self.optimizer.update_opt(loss=loss,
                                      target=self,
                                      network_outputs=[prob_var],
                                      inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss,
                                         target=self,
                                         network_outputs=[prob_var],
                                         inputs=[xs_var, ys_var, old_prob_var],
                                         leq_constraint=(mean_kl, step_size))

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
Пример #34
0
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, ),
                 state_include_action=True,
                 hidden_nonlinearity=NL.tanh,
                 learn_std=True,
                 init_std=1.0,
                 output_nonlinearity=None,
                 **kwargs):
        """Recurrent Gaussian policy with a learned (state-independent) std.

        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
            (exactly one recurrent hidden layer is supported).
        :param state_include_action: when True, the previous action is
            appended to the observation fed to the network.
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param learn_std: whether the log-std parameters are trainable.
        :param init_std: initial std of the action distribution.
        :param output_nonlinearity: nonlinearity applied to the mean output.
        :return:
        """
        Serializable.quick_init(self, locals())
        super(GaussianRNNPolicy, self).__init__(env_spec)

        # Only a single recurrent hidden layer is supported.
        assert len(hidden_sizes) == 1

        if state_include_action:
            obs_dim = env_spec.observation_space.flat_dim + env_spec.action_space.flat_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        self.n_hidden = hidden_sizes[0]
        mean_network = self.create_mean_network(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            **kwargs)

        # State-independent log-std parameters, one per action dimension.
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        # Step-wise variant sharing the same parameters for single-step
        # action sampling.
        l_step_log_std = ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._l_log_std = l_log_std
        self._state_include_action = state_include_action

        # Compiled single-step forward pass: (input, prev hidden) ->
        # (mean, log std, new hidden).
        self._f_step_mean_std = ext.compile_function(
            [
                mean_network.step_input_layer.input_var,
                mean_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer, l_step_log_std,
                mean_network.step_hidden_layer
            ]))

        # Recurrent rollout state.
        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentDiagonalGaussian(action_dim)

        self.reset()
        self.greedy = False
        LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
Пример #35
0
    def __init__(self,
                 name,
                 env_spec,
                 hidden_sizes=(32, 32),
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_hidden_sizes=(32, 32),
                 min_std=1e-6,
                 std_hidden_nonlinearity=tf.nn.tanh,
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None,
                 mean_network=None,
                 std_network=None,
                 std_parametrization='exp'):
        """Gaussian policy whose action mean is an MLP of the observation.

        :param name: TF variable scope under which all variables are created
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: whether the std is a (trainable) function of the
            state; otherwise a single parameter per action dimension
        :param std_share_network: accepted for API compatibility but not used
            by this implementation
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std will be stored, and applied a exponential transformation
            - softplus: the std will be computed as log(1+exp(x))
        :return:
        """
        Serializable.quick_init(self, locals())
        # NOTE(review): presumably requires a continuous (Box) action space;
        # the original isinstance check was left disabled.
        # assert isinstance(env_spec.action_space, Box)

        # AUTO_REUSE lets this constructor run more than once under the same
        # scope without raising on duplicate variables.
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            # create network
            if mean_network is None:
                mean_network = MLP(
                    name="mean_network",
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )
            self._mean_network = mean_network

            l_mean = mean_network.output_layer
            obs_var = mean_network.input_layer.input_var

            if std_network is not None:
                l_std_param = std_network.output_layer
            else:
                if adaptive_std:
                    # State-dependent std: its own MLP sharing the mean
                    # network's input layer.
                    std_network = MLP(
                        name="std_network",
                        input_shape=(obs_dim, ),
                        input_layer=mean_network.input_layer,
                        output_dim=action_dim,
                        hidden_sizes=std_hidden_sizes,
                        hidden_nonlinearity=std_hidden_nonlinearity,
                        output_nonlinearity=None,
                    )
                    l_std_param = std_network.output_layer
                else:
                    # State-independent std: map init_std into parameter
                    # space according to the chosen parametrization.
                    if std_parametrization == 'exp':
                        init_std_param = np.log(init_std)
                    elif std_parametrization == 'softplus':
                        init_std_param = np.log(np.exp(init_std) - 1)
                    else:
                        raise NotImplementedError
                    l_std_param = L.ParamLayer(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=tf.constant_initializer(init_std_param),
                        name="output_std_param",
                        trainable=learn_std,
                    )

            self.std_parametrization = std_parametrization

            # Lower bound on the std, expressed in parameter space; consumed
            # by dist_info_sym when clipping the network output.
            if std_parametrization == 'exp':
                min_std_param = np.log(min_std)
            elif std_parametrization == 'softplus':
                min_std_param = np.log(np.exp(min_std) - 1)
            else:
                raise NotImplementedError

            self.min_std_param = min_std_param

            self._l_mean = l_mean
            self._l_std_param = l_std_param

            self._dist = DiagonalGaussian(action_dim)

            LayersPowered.__init__(self, [l_mean, l_std_param])
            super(GaussianMLPPolicy, self).__init__(env_spec)

            # Build the symbolic distribution info once and compile the
            # feed-forward function used at action-sampling time.
            dist_info_sym = self.dist_info_sym(
                mean_network.input_layer.input_var, dict())
            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            self._f_dist = tensor_utils.compile_function(
                inputs=[obs_var],
                outputs=[mean_var, log_std_var],
            )
Пример #36
0
 def __init__(self, goal_args=('noisy', (.6,.2), .1), frame_skip=5, *args, **kwargs):
     """Picker task environment.

     :param goal_args: goal-sampling spec; presumably ('noisy', center,
         scale) — TODO confirm against the goal sampler that consumes it
     :param frame_skip: number of simulation frames per environment step
     """
     self.goal_args = goal_args

     super(PickerEnv, self).__init__(frame_skip=frame_skip, *args, **kwargs)
     # Record constructor args so the env can be serialized and rebuilt.
     Serializable.__init__(self, goal_args, frame_skip, *args, **kwargs)
    def __init__(self, ip='127.0.0.1', port=9397):
        """Client for a remote simulator reached over a ZeroMQ socket.

        :param ip: host address of the server
        :param port: TCP port the server listens on
        """
        # Open the channel to the backing server; a bad address fails fast.
        self._conn = ZMQConnection(ip, port)
        # Most recently issued action; starts at a neutral scalar value.
        self.prev_action = 0.0
        # Snapshot constructor arguments for pickling/reconstruction.
        Serializable.quick_init(self, locals())
Пример #38
0
    def __init__(
            self,
            name,
            env_spec,
            hidden_dim=32,
            feature_network=None,
            state_include_action=True,
            hidden_nonlinearity=tf.tanh,
            gru_layer_cls=L.GRULayer,
            learn_std=True,
            init_std=1.0,
            output_nonlinearity=None,
    ):
        """Recurrent (GRU) Gaussian policy.

        :param name: TF variable scope for all variables created here
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param feature_network: optional network mapping the flat input to a
            feature vector fed to the GRU; when None the raw input is used
        :param state_include_action: if True, the previous action is appended
            to the observation to form the network input
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param gru_layer_cls: recurrent layer class used by the mean network
        :param learn_std: whether the log-std parameter is trainable
        :param init_std: initial standard deviation of the action distribution
        :param output_nonlinearity: nonlinearity applied to the mean output
        :return:
        """
        with tf.variable_scope(name):
            Serializable.quick_init(self, locals())
            super(GaussianGRUPolicy, self).__init__(env_spec)

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            if state_include_action:
                input_dim = obs_dim + action_dim
            else:
                input_dim = obs_dim

            # Sequence input: (batch, time, input_dim).
            l_input = L.InputLayer(
                shape=(None, None, input_dim),
                name="input"
            )

            if feature_network is None:
                feature_dim = input_dim
                l_flat_feature = None
                l_feature = l_input
            else:
                feature_dim = feature_network.output_layer.output_shape[-1]
                l_flat_feature = feature_network.output_layer
                # The feature network consumes flattened (batch*time, dim)
                # input; reshape its output back to (batch, time, feature).
                l_feature = L.OpLayer(
                    l_flat_feature,
                    extras=[l_input],
                    name="reshape_feature",
                    # BUGFIX: tf.pack was renamed tf.stack in TF 1.0 and no
                    # longer exists in the TF versions this code runs under.
                    op=lambda flat_feature, input: tf.reshape(
                        flat_feature,
                        tf.stack(
                            [tf.shape(input)[0], tf.shape(input)[1], feature_dim])
                    ),
                    shape_op=lambda _, input_shape: (
                        input_shape[0], input_shape[1], feature_dim)
                )

            mean_network = GRUNetwork(
                input_shape=(feature_dim,),
                input_layer=l_feature,
                output_dim=action_dim,
                hidden_dim=hidden_dim,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                gru_layer_cls=gru_layer_cls,
                name="mean_network"
            )

            # State-independent log-std: one parameter per action dimension.
            l_log_std = L.ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=tf.constant_initializer(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

            # The same underlying parameter (param=l_log_std.param) exposed
            # on the one-step input layer for the step function below.
            l_step_log_std = L.ParamLayer(
                mean_network.step_input_layer,
                num_units=action_dim,
                param=l_log_std.param,
                name="step_output_log_std",
                trainable=learn_std,
            )

            self.mean_network = mean_network
            self.feature_network = feature_network
            self.l_input = l_input
            self.state_include_action = state_include_action

            # Single-step input: (batch, input_dim) — no time axis.
            flat_input_var = tf.placeholder(
                dtype=tf.float32, shape=(None, input_dim), name="flat_input")
            if feature_network is None:
                feature_var = flat_input_var
            else:
                feature_var = L.get_output(
                    l_flat_feature, {feature_network.input_layer: flat_input_var})

            # Compiled one-step function: (input, prev state) ->
            # (action mean, log-std, new hidden state). Used at rollout time.
            self.f_step_mean_std = tensor_utils.compile_function(
                [
                    flat_input_var,
                    mean_network.step_prev_state_layer.input_var
                ],
                L.get_output([
                    mean_network.step_output_layer,
                    l_step_log_std,
                    mean_network.step_hidden_layer,
                ], {mean_network.step_input_layer: feature_var})
            )

            self.l_log_std = l_log_std

            self.input_dim = input_dim
            self.action_dim = action_dim
            self.hidden_dim = hidden_dim

            # Rollout state; populated by reset()/get_actions().
            self.prev_actions = None
            self.prev_hiddens = None
            self.dist = RecurrentDiagonalGaussian(action_dim)

            out_layers = [mean_network.output_layer, l_log_std, l_step_log_std]
            if feature_network is not None:
                out_layers.append(feature_network.output_layer)

            LayersPowered.__init__(self, out_layers)
    def __init__(self,
                 env_spec,
                 name='qnet',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.relu,
                 action_merge_layer=-2,
                 output_nonlinearity=None,
                 eqf_use_full_qf=False,
                 eqf_sample_size=1,
                 bn=False):
        """Continuous MLP Q-function Q(s, a).

        The observation is fed through the first hidden layers; the action is
        concatenated in at ``action_merge_layer`` and the network ends in a
        single scalar output.

        :param env_spec: spec providing observation and action dimensions
        :param name: TF variable scope for the network
        :param hidden_sizes: sizes of the fully-connected hidden layers
        :param hidden_nonlinearity: nonlinearity for each hidden layer
        :param action_merge_layer: hidden-layer index at which the action
            input is concatenated; may be negative (counted from the end)
        :param output_nonlinearity: nonlinearity of the scalar output layer
        :param eqf_use_full_qf: stored flag consumed by callers
        :param eqf_sample_size: stored sample count consumed by callers
        :param bn: whether to batch-normalize each hidden layer's input
        """
        Serializable.quick_init(self, locals())

        # Q(s, a) with a dense action input is only defined for continuous
        # action spaces.
        assert not env_spec.action_space.is_discrete
        self._env_spec = env_spec

        with tf.variable_scope(name):
            l_obs = L.InputLayer(shape=(None,
                                        env_spec.observation_space.flat_dim),
                                 name="obs")
            l_action = L.InputLayer(shape=(None,
                                           env_spec.action_space.flat_dim),
                                    name="actions")

            n_layers = len(hidden_sizes) + 1

            # Normalize a possibly-negative merge index into [0, n_layers);
            # the double modulo also handles values below -n_layers.
            if n_layers > 1:
                action_merge_layer = \
                    (action_merge_layer % n_layers + n_layers) % n_layers
            else:
                # No hidden layers: merge the action just before the output
                # layer (handled by the check after the loop below).
                action_merge_layer = 1

            l_hidden = l_obs

            for idx, size in enumerate(hidden_sizes):
                if bn:
                    l_hidden = batch_norm(l_hidden)

                if idx == action_merge_layer:
                    l_hidden = L.ConcatLayer([l_hidden, l_action])

                l_hidden = L.DenseLayer(l_hidden,
                                        num_units=size,
                                        nonlinearity=hidden_nonlinearity,
                                        name="h%d" % (idx + 1))

            if action_merge_layer == n_layers:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_output = L.DenseLayer(l_hidden,
                                    num_units=1,
                                    nonlinearity=output_nonlinearity,
                                    name="output")

            # Deterministic pass (batch norm in inference mode), flattened
            # from (batch, 1) to a vector of Q-values.
            output_var = L.get_output(l_output, deterministic=True)
            output_var = tf.reshape(output_var, (-1, ))

            self._f_qval = tensor_utils.compile_function(
                [l_obs.input_var, l_action.input_var], output_var)
            self._output_layer = l_output
            self._obs_layer = l_obs
            self._action_layer = l_action
            self._output_nonlinearity = output_nonlinearity
            # NOTE: typically False for most common algorithms (e.g. VPG).
            self.eqf_use_full_qf = eqf_use_full_qf

            self.eqf_sample_size = eqf_sample_size

            LayersPowered.__init__(self, [l_output])
    def __init__(self,
                 name,
                 input_shape,
                 output_dim,
                 mean_network=None,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=tf.identity,
                 optimizer=None,
                 use_trust_region=True,
                 step_size=0.01,
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_hidden_sizes=(32, 32),
                 std_nonlinearity=None,
                 normalize_inputs=True,
                 normalize_outputs=True,
                 subsample_factor=1.0):
        """Gaussian MLP regressor with optional KL trust-region fitting.

        :param name: TF variable scope for all variables created here.
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param init_std: Initial standard deviation of the output distribution.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        :param normalize_inputs: Whether inputs are whitened at fit time.
        :param normalize_outputs: Whether targets are whitened at fit time.
        :param subsample_factor: Fraction of samples used per fit.
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):

            # Default optimizer: penalized L-BFGS when a KL trust region is
            # requested, plain L-BFGS otherwise.
            if optimizer is None:
                if use_trust_region:
                    optimizer = PenaltyLbfgsOptimizer("optimizer")
                else:
                    optimizer = LbfgsOptimizer("optimizer")

            self._optimizer = optimizer
            self._subsample_factor = subsample_factor

            if mean_network is None:
                mean_network = MLP(
                    name="mean_network",
                    input_shape=input_shape,
                    output_dim=output_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )

            l_mean = mean_network.output_layer

            if adaptive_std:
                # State-dependent log-std produced by its own MLP sharing the
                # mean network's input; always trainable (learn_std ignored).
                l_log_std = MLP(
                    name="log_std_network",
                    input_shape=input_shape,
                    input_var=mean_network.input_layer.input_var,
                    output_dim=output_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_nonlinearity,
                    output_nonlinearity=None,
                ).output_layer
            else:
                # Single state-independent log-std parameter per output dim.
                l_log_std = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=output_dim,
                    param=tf.constant_initializer(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

            LayersPowered.__init__(self, [l_mean, l_log_std])

            xs_var = mean_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    name="ys",
                                    shape=(None, output_dim))
            # BUGFIX: this placeholder was also named "ys" (copy-paste from
            # the line above), so TF silently renamed it and graph inspection
            # was misleading; give it its own name.
            old_means_var = tf.placeholder(dtype=tf.float32,
                                           name="old_means",
                                           shape=(None, output_dim))
            old_log_stds_var = tf.placeholder(dtype=tf.float32,
                                              name="old_log_stds",
                                              shape=(None, output_dim))

            # Running whitening statistics; updated at fit time, never by
            # the optimizer (trainable=False).
            x_mean_var = tf.Variable(np.zeros((1, ) + input_shape,
                                              dtype=np.float32),
                                     name="x_mean",
                                     trainable=False)
            x_std_var = tf.Variable(np.ones((1, ) + input_shape,
                                            dtype=np.float32),
                                    name="x_std",
                                    trainable=False)
            y_mean_var = tf.Variable(np.zeros((1, output_dim),
                                              dtype=np.float32),
                                     name="y_mean",
                                     trainable=False)
            y_std_var = tf.Variable(np.ones((1, output_dim), dtype=np.float32),
                                    name="y_std",
                                    trainable=False)

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var
            normalized_ys_var = (ys_var - y_mean_var) / y_std_var

            normalized_means_var = L.get_output(
                l_mean, {mean_network.input_layer: normalized_xs_var})
            normalized_log_stds_var = L.get_output(
                l_log_std, {mean_network.input_layer: normalized_xs_var})

            # De-normalize network outputs back to the target scale.
            means_var = normalized_means_var * y_std_var + y_mean_var
            log_stds_var = normalized_log_stds_var + tf.log(y_std_var)

            normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
            normalized_old_log_stds_var = old_log_stds_var - tf.log(y_std_var)

            # Symbolic prediction with explicit parameter values; used when
            # constructing the meta-learning objective.
            def normalized_means_var_sym(xs, params):
                inputs = OrderedDict({mean_network.input_layer: xs})
                inputs.update(params)
                return L.get_output(layer_or_layers=l_mean, inputs=inputs)

            means_var_sym = lambda xs, params: normalized_means_var_sym(
                xs=xs, params=params) * y_std_var + y_mean_var

            dist = self._dist = DiagonalGaussian(output_dim)

            normalized_dist_info_vars = dict(mean=normalized_means_var,
                                             log_std=normalized_log_stds_var)

            mean_kl = tf.cast(
                tf.reduce_mean(
                    dist.kl_sym(
                        dict(mean=normalized_old_means_var,
                             log_std=normalized_old_log_stds_var),
                        normalized_dist_info_vars,
                    )), tf.float32)

            # NOTE: the loss is MSE plus a squared log-std penalty, not the
            # Gaussian negative log-likelihood (the NLL variant was
            # deliberately disabled upstream).
            loss = tf.cast(
                tf.reduce_mean(
                    tf.square(normalized_ys_var - normalized_means_var)) +
                tf.reduce_mean(tf.square(normalized_log_stds_var)), tf.float32)
            self._f_predict = tensor_utils.compile_function([xs_var],
                                                            means_var)
            self._f_pdists = tensor_utils.compile_function(
                [xs_var], [means_var, log_stds_var])
            self._l_mean = l_mean
            self._l_log_std = l_log_std

            self._f_predict_sym = means_var_sym
            self.loss_sym = loss
            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[
                    normalized_means_var, normalized_log_stds_var
                ],
            )

            # The trust-region variant additionally feeds the old
            # distribution and constrains the mean KL per update.
            if use_trust_region:
                optimizer_args["leq_constraint"] = (mean_kl, step_size)
                optimizer_args["inputs"] = [
                    xs_var, ys_var, old_means_var, old_log_stds_var
                ]
            else:
                optimizer_args["inputs"] = [xs_var, ys_var]

            self._optimizer.update_opt(**optimizer_args)

            self._use_trust_region = use_trust_region
            self._name = name

            self._normalize_inputs = normalize_inputs
            self._normalize_outputs = normalize_outputs
            self._mean_network = mean_network
            self._x_mean_var = x_mean_var
            self._x_std_var = x_std_var
            self._y_mean_var = y_mean_var
            self._y_std_var = y_std_var
Пример #41
0
 def __init__(self, *args, **kwargs):
     """Reacher environment; all arguments are forwarded to the base env."""
     super(ReacherEnv, self).__init__(*args, **kwargs)
     # Record constructor args so the env can be serialized and rebuilt.
     Serializable.quick_init(self, locals())
 def __init__(self, *args, **kwargs):
     """Wrapper env; all arguments are forwarded to the base class."""
     super(BlkCmplxObs, self).__init__(*args, **kwargs)
     # Record constructor args so the object can be serialized and rebuilt.
     Serializable.quick_init(self, locals())
Пример #43
0
    def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
        grad_step_size=1.0,
        stop_grad=False,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std will be stored, and applied a exponential transformation
            - softplus: the std will be computed as log(1+exp(x))
        :param grad_step_size: the step size taken in the learner's gradient update, sample uniformly if it is a range e.g. [0.1,1]
        :param stop_grad: whether or not to stop the gradient through the gradient.
        :return:
        """
        Serializable.quick_init(self, locals())
        # MAML needs a continuous action space here.
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim
        self.n_hidden = len(hidden_sizes)
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity
        # Batched observation input shape: (batch, obs_dim).
        self.input_shape = (
            None,
            obs_dim,
        )
        # Inner-loop (adaptation) gradient step size.
        self.step_size = grad_step_size
        self.stop_grad = stop_grad
        if type(self.step_size) == list:
            raise NotImplementedError('removing this since it didnt work well')

        # create network
        if mean_network is None:
            # Parameters are kept in an explicit dict (self.all_params) so
            # the MAML inner update can rebuild the forward pass with
            # post-gradient parameter values.
            self.all_params = self.create_MLP(  # TODO: this should not be a method of the policy! --> helper
                name="mean_network",
                output_dim=self.action_dim,
                hidden_sizes=hidden_sizes,
            )
            self.input_tensor, _ = self.forward_MLP(
                'mean_network',
                self.all_params,
                reuse=None  # Need to run this for batch norm
            )
            # Mean forward pass with explicit parameters; [1] selects the
            # output tensor from the (input, output) pair forward_MLP returns.
            forward_mean = lambda x, params, is_train: self.forward_MLP(
                'mean_network', params, input_tensor=x, is_training=is_train)[1
                                                                              ]
        else:
            raise NotImplementedError('Not supported.')

        if std_network is not None:
            raise NotImplementedError('Not supported.')
        else:
            if adaptive_std:
                raise NotImplementedError('Not supported.')
            else:
                # State-independent std stored in parameter space (log-std
                # for 'exp', pre-softplus value for 'softplus').
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                self.all_params['std_param'] = make_param_layer(
                    num_units=self.action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )
                forward_std = lambda x, params: forward_param_layer(
                    x, params['std_param'])
            self.all_param_vals = None

            # unify forward mean and forward std into a single function
            self._forward = lambda obs, params, is_train: (forward_mean(
                obs, params, is_train), forward_std(obs, params))

            self.std_parametrization = std_parametrization

            # Lower bound on the std, expressed in parameter space.
            if std_parametrization == 'exp':
                min_std_param = np.log(min_std)
            elif std_parametrization == 'softplus':
                min_std_param = np.log(np.exp(min_std) - 1)
            else:
                raise NotImplementedError

            self.min_std_param = min_std_param

            self._dist = DiagonalGaussian(self.action_dim)

            self._cached_params = {}

            super(MAMLGaussianMLPPolicy, self).__init__(env_spec)

            dist_info_sym = self.dist_info_sym(self.input_tensor,
                                               dict(),
                                               is_training=False)
            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            # pre-update policy
            self._init_f_dist = tensor_utils.compile_function(
                inputs=[self.input_tensor],
                outputs=[mean_var, log_std_var],
            )
            # _cur_f_dist is replaced by a post-adaptation version during
            # meta-testing; it starts as the pre-update distribution.
            self._cur_f_dist = self._init_f_dist
    def __init__(
        self,
        env_spec,
        env,
        pkl_paths=(),
        json_paths=(),
        npz_paths=(),
        trainable_old=True,
        external_selector=False,
        hidden_sizes_selector=(10, 10),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        """Hierarchical Gaussian MLP policy mixing several pre-trained policies.

        A selector (either a softmax MLP over the full observation, or an
        externally fixed weight vector) produces a convex combination of the
        mean/log-std outputs of old policies loaded from disk.

        :param env_spec: environment spec; provides the flat action dimension
        :param env: environment instance; used to split robot vs. maze obs dims
        :param pkl_paths: tuple/list of pkl paths of the old policies
        :param json_paths: tuple/list of json paths describing the old policies
        :param npz_paths: tuple/list of npz paths with the old policies' params
        :param trainable_old: are the old policies still trainable
        :param external_selector: if True, the linear combination of the old
            policies' outputs is fixed externally (via ``shared_selector_var``)
            instead of produced by a selector MLP
        :param hidden_sizes_selector: sizes of the selector MLP's hidden layers
        :param learn_std: is std trainable
        :param init_std: initial std
        :param adaptive_std: accepted for interface compatibility; not used here
        :param std_share_network: accepted for interface compatibility; not used here
        :param std_hidden_sizes: accepted for interface compatibility; not used here
        :param std_hidden_nonlinearity: accepted for interface compatibility; not used here
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param min_std: lower bound applied to the std, to avoid numerical issues
        """
        # define where are the old policies to use and what to do with them:
        self.trainable_old = trainable_old  # whether to keep training the old policies loaded here
        self.pkl_paths = pkl_paths
        self.json_paths = json_paths
        self.npz_paths = npz_paths
        self.selector_dim = max(
            len(json_paths), len(pkl_paths))  # pkl could be zero if giving npz
        # if not use a selector NN here, just externally fixed selector variable:
        self.external_selector = external_selector  # whether to use the selectorNN defined here or the pre_fix_selector
        self.pre_fix_selector = np.zeros(
            (self.selector_dim)
        )  # if this is not empty when using reset() it will use this selector
        self.selector_fix = np.zeros(
            (self.selector_dim
             ))  # this will hold the selectors variable sampled in reset()
        self.shared_selector_var = theano.shared(
            self.selector_fix)  # this is for external selector! update that
        # else, describe the MLP used:
        self.hidden_sizes_selector = hidden_sizes_selector  # size of the selector NN defined here
        self.min_std = min_std
        self._set_std_to_0 = False

        self.action_dim = env_spec.action_space.flat_dim  # not checking that all the old policies have this act_dim

        self.old_hidden_sizes = []
        # assume json always given
        for json_path in self.json_paths:
            data = json.load(
                open(os.path.join(config.PROJECT_PATH, json_path), 'r'))
            old_json_policy = data['json_args']["policy"]
            self.old_hidden_sizes.append(old_json_policy['hidden_sizes'])

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        if self.external_selector:  # in case we want to fix the selector externally
            l_all_obs_var = L.InputLayer(
                shape=(None, ) + (self.obs_robot_dim + self.obs_maze_dim, ))
            all_obs_var = l_all_obs_var.input_var
            l_selection = ParamLayer(incoming=l_all_obs_var,
                                     num_units=self.selector_dim,
                                     param=self.shared_selector_var,
                                     trainable=False)
            selection_var = L.get_output(l_selection)
        else:
            # create network with softmax output: it will be the selector!
            selector_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim, ),
                output_dim=self.selector_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = selector_network.input_layer
            all_obs_var = selector_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to selectors)
            l_selection = selector_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        l_obs_robot = CropLayer(l_all_obs_var,
                                start_index=None,
                                end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var,
                               start_index=self.obs_robot_dim,
                               end_index=None)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # create the action networks
        self.old_l_means = [
        ]  # I do this self in case I wanna access it from reset
        self.old_l_log_stds = []
        self.old_layers = []
        for i in range(self.selector_dim):
            mean_network = MLP(
                input_layer=l_obs_robot,
                output_dim=self.action_dim,
                hidden_sizes=self.old_hidden_sizes[i],
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="meanMLP{}".format(i),
            )
            self.old_l_means.append(mean_network.output_layer)
            self.old_layers += mean_network.layers

            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=self.action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std{}".format(i),
                trainable=learn_std,
            )
            self.old_l_log_stds.append(l_log_std)
            self.old_layers += [l_log_std]

        if not self.trainable_old:
            for layer in self.old_layers:
                for param, tags in layer.params.items(
                ):  # params of layer are OrDict: key=the shared var, val=tags
                    tags.remove("trainable")

        # load the old policies' parameter values, renaming each entry with its
        # policy index so they land on the per-policy layers created above
        if self.json_paths and self.npz_paths:
            old_params_dict = {}
            for i, npz_path in enumerate(self.npz_paths):
                params_dict = dict(
                    np.load(os.path.join(config.PROJECT_PATH, npz_path)))
                for key in params_dict.keys():
                    if key == 'output_log_std.param':
                        old_params_dict['output_log_std{}.param'.format(
                            i)] = params_dict[key]
                    elif 'meanMLP_' == key[:8]:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        key[8:]] = params_dict[key]
                    else:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        key] = params_dict[key]
            self.set_old_params(old_params_dict)

        elif self.pkl_paths:
            old_params_dict = {}
            for i, pkl_path in enumerate(self.pkl_paths):
                data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
                params = data['policy'].get_params_internal()
                for param in params:
                    if param.name == 'output_log_std.param':
                        old_params_dict['output_log_std{}.param'.format(
                            i)] = param.get_value()
                    elif 'meanMLP_' == param.name[:8]:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        param.name[8:]] = param.get_value()
                    else:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        param.name] = param.get_value()
            self.set_old_params(old_params_dict)

        # new layers actually selecting the correct output
        l_mean = SumProdLayer(self.old_l_means + [l_selection])
        l_log_std = SumProdLayer(self.old_l_log_stds + [l_selection])
        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(self.action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_multi_hier, self).__init__(env_spec)

        self._f_old_means = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[
                L.get_output(l_old_mean) for l_old_mean in self.old_l_means
            ])

        self._f_all_inputs = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[
                L.get_output(l_old_mean) for l_old_mean in self.old_l_means
            ] + [selection_var])

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )
Example #45
0
 def __init__(self, wrapped_env, action_space, observation_space):
     """Wrap an environment while overriding its reported spaces.

     :param wrapped_env: the environment being wrapped
     :param action_space: space reported as this env's action space
     :param observation_space: space reported as this env's observation space
     """
     # Record constructor args for rllab serialization.
     Serializable.quick_init(self, locals())
     super(SpecWrapperEnv, self).__init__(wrapped_env)
     self._action_space = action_space
     self._observation_space = observation_space
 def __init__(self, num_slices=1):
     """Store construction args; the optimizer state is configured later.

     `target`, `reg_coeff` and `opt_fun` start unset and are presumably
     populated by an update_opt-style call -- TODO confirm against the class.

     :param num_slices: number of slices used when evaluating in chunks
     """
     Serializable.quick_init(self, locals())
     self._num_slices = num_slices
     # Deferred state: filled in after construction.
     self.target = None
     self.reg_coeff = None
     self.opt_fun = None
Example #47
0
 def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
     """Walker2D Mujoco environment.

     :param ctrl_cost_coeff: weight of the control (action) cost term
     """
     # Set before super().__init__ -- presumably the base constructor can
     # trigger code that reads ctrl_cost_coeff (TODO confirm).
     self.ctrl_cost_coeff = ctrl_cost_coeff
     super(Walker2DEnv, self).__init__(*args, **kwargs)
     # Record constructor args for rllab serialization.
     Serializable.quick_init(self, locals())
Example #48
0
 def __init__(self):
     """No state of its own; only records constructor args for serialization."""
     Serializable.quick_init(self, locals())
Example #49
0
    def __init__(self,
                 eta=0.01,
                 alpha=0.001,
                 max_epochs=1,
                 tolerance=1e-5,
                 batch_size=32,
                 epsilon=1e-8,
                 verbose=False,
                 num_slices=1,
                 use_SGD=False,
                 scale=1.0,
                 backtrack_ratio=0.5,
                 max_backtracks=10,
                 cg_iters=10,
                 reg_coeff=1e-5,
                 subsample_factor=1.,
                 hvp_approach=None,
                 max_batch=10,
                 learning_rate=1e-3,
                 **kwargs):
        """
        :param eta: step-size-related coefficient (stored as-is)
        :param alpha: step-size-related coefficient (stored as-is)
        :param max_epochs:
        :param tolerance:
        :param update_method:
        :param batch_size: None or an integer. If None the whole dataset will be used.
        :param cg_iters: The number of CG iterations used to calculate A^-1 g
        :param reg_coeff: A small value so that A -> A + reg*I
        :param subsample_factor: Subsampling factor to reduce samples when using "conjugate gradient. Since the
        computation time for the descent direction dominates, this can greatly reduce the overall computation time.
        :param hvp_approach: Hessian-vector-product strategy; defaults to
        PerlmutterHvp(num_slices) when None.
        :param kwargs:
        :return:
        """
        Serializable.quick_init(self, locals())
        self._eta = eta
        self._alpha = alpha
        self._opt_fun = None
        self._target = None
        self._max_epochs = max_epochs
        self._tolerance = tolerance
        self._batch_size = batch_size
        self._epsilon = epsilon
        self._verbose = verbose
        self._input_vars = None
        self._num_slices = num_slices
        self._scale = scale
        self._use_SGD = use_SGD
        self._backtrack_ratio = backtrack_ratio
        self._max_backtracks = max_backtracks
        self._max_batch = max_batch
        self._learning_rate = learning_rate

        self._cg_iters = cg_iters
        self._reg_coeff = reg_coeff
        self._subsample_factor = subsample_factor
        if hvp_approach is None:
            hvp_approach = PerlmutterHvp(num_slices)
        self._hvp_approach = hvp_approach

        logger.log('max_batch %d' % (self._max_batch))
        # Use %s here: batch_size may legitimately be None (whole-dataset
        # mode, per the docstring), and '%d' % None raises TypeError.
        logger.log('mini_batch %s' % (self._batch_size))
        logger.log('cg_iters %d' % (self._cg_iters))
        logger.log('subsample_factor %f' % (self._subsample_factor))
Example #50
0
    def __init__(self, env_params, sim_params, scenario, simulator='traci'):
        """Initialize the environment class.

        Parameters
        ----------
        env_params : flow.core.params.EnvParams
           see flow/core/params.py
        sim_params : flow.core.params.SimParams
           see flow/core/params.py
        scenario : flow.scenarios.Scenario
            see flow/scenarios/base_scenario.py
        simulator : str
            the simulator used, one of {'traci', 'aimsun'}. Defaults to 'traci'

        Raises
        ------
        flow.utils.exceptions.FatalFlowError
            if the render mode is not set to a valid value
        """
        # Invoke serializable if using rllab
        if serializable_flag:
            Serializable.quick_init(self, locals())

        self.env_params = env_params
        self.scenario = scenario
        self.sim_params = sim_params
        time_stamp = ''.join(str(time.time()).split('.'))
        # NOTE(review): os.environ values are strings, so any *set* TEST_FLAG
        # (even "0") is truthy here -- confirm that is the intended gate.
        if os.environ.get("TEST_FLAG", 0):
            # 1.0 works with stress_test_start 10k times
            time.sleep(1.0 * int(time_stamp[-6:]) / 1e6)
        # FIXME: this is sumo-specific
        self.sim_params.port = sumolib.miscutils.getFreeSocketPort()
        # time_counter: number of steps taken since the start of a rollout
        self.time_counter = 0
        # step_counter: number of total steps taken
        self.step_counter = 0
        # initial_state:
        #   Key = Vehicle ID,
        #   Entry = (type_id, route_id, lane_index, lane_pos, speed, pos)
        self.initial_state = {}
        self.state = None
        self.obs_var_labels = []

        # simulation step size
        self.sim_step = sim_params.sim_step

        # the simulator used by this environment
        self.simulator = simulator

        # create the Flow kernel
        self.k = Kernel(simulator=self.simulator,
                        sim_params=sim_params)

        # use the scenario class's network parameters to generate the necessary
        # scenario components within the scenario kernel
        self.k.scenario.generate_network(scenario)

        # initial the vehicles kernel using the VehicleParams object
        self.k.vehicle.initialize(deepcopy(scenario.vehicles))

        # initialize the simulation using the simulation kernel. This will use
        # the scenario kernel as an input in order to determine what network
        # needs to be simulated.
        kernel_api = self.k.simulation.start_simulation(
            scenario=self.k.scenario, sim_params=sim_params)

        # pass the kernel api to the kernel and it's subclasses
        self.k.pass_api(kernel_api)

        # the available_routes variable contains a dictionary of routes
        # vehicles can traverse; to be used when routes need to be chosen
        # dynamically
        self.available_routes = self.k.scenario.rts

        # store the initial vehicle ids
        self.initial_ids = deepcopy(scenario.vehicles.ids)

        # store the initial state of the vehicles kernel (needed for restarting
        # the simulation)
        # Detach the live handles before deepcopy -- presumably they are not
        # copyable (e.g. the live simulator connection) -- then restore them.
        self.k.vehicle.kernel_api = None
        self.k.vehicle.master_kernel = None
        self.initial_vehicles = deepcopy(self.k.vehicle)
        self.k.vehicle.kernel_api = self.k.kernel_api
        self.k.vehicle.master_kernel = self.k

        self.setup_initial_state()

        # use pyglet to render the simulation
        if self.sim_params.render in ['gray', 'dgray', 'rgb', 'drgb']:
            save_render = self.sim_params.save_render
            sight_radius = self.sim_params.sight_radius
            pxpm = self.sim_params.pxpm
            show_radius = self.sim_params.show_radius

            # get network polygons
            network = []
            # FIXME: add to scenario kernel instead of hack
            for lane_id in self.k.kernel_api.lane.getIDList():
                _lane_poly = self.k.kernel_api.lane.getShape(lane_id)
                # flatten [(x, y), ...] point pairs into [x, y, x, y, ...]
                lane_poly = [i for pt in _lane_poly for i in pt]
                network.append(lane_poly)

            # instantiate a pyglet renderer
            self.renderer = Renderer(
                network,
                self.sim_params.render,
                save_render,
                sight_radius=sight_radius,
                pxpm=pxpm,
                show_radius=show_radius)

            # render a frame
            self.render(reset=True)
        elif self.sim_params.render in [True, False]:
            pass  # default to sumo-gui (if True) or sumo (if False)
        else:
            raise FatalFlowError(
                'Mode %s is not supported!' % self.sim_params.render)
        # ensure the simulator is shut down when the interpreter exits
        atexit.register(self.terminate)
Example #51
0
    def __init__(self, name, generator_class, vehicles, net_params,
                 initial_config=InitialConfig()):
        """
        Abstract base class. Initializes a new scenario. This class can be
        instantiated once and reused in multiple experiments. Note that this
        function stores all the relevant parameters. The generate() function
        still needs to be called separately.

        Attributes
        ----------
        name: str
            A tag associated with the scenario
        generator_class: Generator type
            Class for generating configuration and net files with placed
            vehicles, e.g. CircleGenerator
        vehicles: Vehicles type
            see flow/core/vehicles.py
        net_params: NetParams type
            see flow/core/params.py
        initial_config: InitialConfig type
            see flow/core/params.py

        Raises
        ------
        ValueError
            If no "length" is provided in net_params
        """
        # Record constructor args for rllab serialization.
        Serializable.quick_init(self, locals())

        self.name = name
        self.generator_class = generator_class
        self.vehicles = vehicles
        self.net_params = net_params
        self.initial_config = initial_config

        # parameters to be specified under each unique subclass's
        # __init__() function
        self.edgestarts = self.specify_edge_starts()

        # these optional parameters need only be used if "no-internal-links"
        # is set to "false" while calling sumo's netconvert function
        self.internal_edgestarts = self.specify_internal_edge_starts()
        self.intersection_edgestarts = self.specify_intersection_edge_starts()

        # in case the user did not write the intersection edge-starts in
        # internal edge-starts as well (because of redundancy), merge the two
        # together
        self.internal_edgestarts += self.intersection_edgestarts
        seen = set()
        # Deduplicate by edge name while preserving order: `seen.add` returns
        # None, so `not seen.add(...)` is True exactly on first occurrence.
        self.internal_edgestarts = \
            [item for item in self.internal_edgestarts
             if item[1] not in seen and not seen.add(item[1])]

        # total_edgestarts and total_edgestarts_dict contain all of the above
        # edges, with the former being ordered by position
        if self.net_params.no_internal_links:
            self.total_edgestarts = self.edgestarts
        else:
            self.total_edgestarts = self.edgestarts + self.internal_edgestarts
        self.total_edgestarts.sort(key=lambda tup: tup[1])

        self.total_edgestarts_dict = dict(self.total_edgestarts)

        # length of the network, or the portion of the network in which cars are
        # meant to be distributed (to be calculated during subclass __init__(),
        # or specified in net_params)
        if not hasattr(self, "length"):
            if "length" in self.net_params.additional_params:
                self.length = self.net_params.additional_params["length"]
            else:
                raise ValueError("The network does not have a specified length.")

        # generate starting position for vehicles in the network
        if self.initial_config.positions is None:
            self.initial_config.positions, self.initial_config.lanes = \
                self.generate_starting_positions()

        self.cfg = self.generate()
Example #52
0
 def __getstate__(self):
     """Serialize via Serializable, optionally embedding parameter values."""
     state = Serializable.__getstate__(self)
     if load_params:
         # Module-level flag; only read here, so no `global` declaration needed.
         state["params"] = self.get_param_values(all_params=True)
     return state
Example #53
0
 def __init__(self, goal=None, *args, **kwargs):
     """HalfCheetah environment with a settable goal velocity.

     :param goal: target velocity; presumably None means one is sampled at
         reset -- TODO confirm against the class's reset/sample logic.
     """
     self._goal_vel = goal
     super(HalfCheetahEnvRand, self).__init__(*args, **kwargs)
     Serializable.__init__(self, *args, **kwargs)
Example #54
0
    def __init__(
            self,
            base_kwargs,

            env,
            policy,
            initial_exploration_policy,
            qf1,
            qf2,
            vf,
            pool,
            plotter=None,

            lr=3e-3,
            scale_reward=1,
            scale_entropy=1,
            discount=0.99,
            tau=0.01,
            target_update_interval=1,
            action_prior='uniform',
            reparameterize=False,

            save_full_state=False,
    ):
        """
        Args:
            base_kwargs (dict): dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.

            env (`rllab.Env`): rllab environment object.
            policy: (`rllab.NNPolicy`): A policy function approximator.
            initial_exploration_policy: ('Policy'): A policy that we use
                for initial exploration which is not trained by the algorithm.

            qf1 (`valuefunction`): First Q-function approximator.
            qf2 (`valuefunction`): Second Q-function approximator. Usage of two
                Q-functions improves performance by reducing overestimation
                bias.
            vf (`ValueFunction`): Soft value function approximator.

            pool (`PoolBase`): Replay buffer to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.

            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.

            reparameterize ('bool'): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. We use
                a likelihood ratio based estimator otherwise.
            save_full_state (`bool`): If True, save the full class in the
                snapshot. See `self.get_snapshot` for more information.
        """

        # Record constructor args for rllab serialization.
        Serializable.quick_init(self, locals())
        super(SAC, self).__init__(**base_kwargs)

        self._env = env
        self._policy = policy
        self._initial_exploration_policy = initial_exploration_policy
        self._qf1 = qf1
        self._qf2 = qf2
        self._vf = vf
        self._pool = pool
        self._plotter = plotter

        # A single learning rate is shared by policy, Q and V approximators.
        self._policy_lr = lr
        self._qf_lr = lr
        self._vf_lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        # Reparameterize parameter must match between the algorithm and the
        # policy actions are sampled from.
        assert reparameterize == self._policy._reparameterize
        self._reparameterize = reparameterize

        self._save_full_state = save_full_state

        # Flat dimensionalities of the action and observation spaces.
        self._Da = self._env.action_space.flat_dim
        self._Do = self._env.observation_space.flat_dim

        self._training_ops = list()

        # Build the TF graph: placeholders first, then the actor/critic
        # updates that reference them, then the target-network update ops.
        self._init_placeholders()
        self._init_actor_update()
        self._init_critic_update()
        self._init_target_ops()

        # Initialize all uninitialized variables. This prevents initializing
        # pre-trained policy and qf and vf variables.
        uninit_vars = []
        for var in tf.global_variables():
            try:
                self._sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        self._sess.run(tf.variables_initializer(uninit_vars))
Example #55
0
 def __init__(self, *args, **kwargs):
     """Ant Mujoco environment; forwards all arguments to the base class."""
     super(AntEnv, self).__init__(*args, **kwargs)
     Serializable.__init__(self, *args, **kwargs)
Example #56
0
    def __init__(
        self,
        name,
        input_shape,
        output_dim,
        hidden_sizes,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_nonlinearity=NL.rectify,
        mean_network=None,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        subsample_factor=1.0,
        batchsize=None,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_conv_filters=[],
        std_conv_filters_sizes=[],
        std_conv_strides=[],
        std_conv_pads=[],
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
    ):
        """
        :param input_shape: usually for images of the form (width,height,channel)
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer

        self.input_shape = input_shape
        if mean_network is None:
            mean_network = ConvNetwork(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = ConvNetwork(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                conv_filters=std_conv_filters,
                # BUGFIX: was the undefined name `std_conv_filter_sizes`,
                # which raised NameError whenever adaptive_std=True; the
                # constructor parameter is `std_conv_filters_sizes`.
                conv_filter_sizes=std_conv_filters_sizes,
                conv_strides=std_conv_strides,
                conv_pads=std_conv_pads,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        # Running input/output normalization statistics, stored as shared
        # variables so they can be updated during fitting.
        x_mean_var = theano.shared(
            np.zeros((1, np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_mean",
            broadcastable=(True, False),
        )
        x_std_var = theano.shared(
            np.ones((1, np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_std",
            broadcastable=(True, False),
        )
        y_mean_var = theano.shared(np.zeros((1, output_dim),
                                            dtype=theano.config.floatX),
                                   name="y_mean",
                                   broadcastable=(True, False))
        y_std_var = theano.shared(np.ones((1, output_dim),
                                          dtype=theano.config.floatX),
                                  name="y_std",
                                  broadcastable=(True, False))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        # Un-normalize the network outputs back into the original y space.
        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        # Mean KL between the previous fit and the current one (trust region).
        mean_kl = TT.mean(
            dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        # Negative log-likelihood of the (normalized) targets.
        loss = - \
            TT.mean(dist.log_likelihood_sym(
                normalized_ys_var, normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
        self._subsample_factor = subsample_factor
        self._batchsize = batchsize
Example #57
0
    def __init__(self,
                 name,
                 input_shape,
                 extra_input_shape,
                 output_dim,
                 hidden_sizes,
                 conv_filters,
                 conv_filter_sizes,
                 conv_strides,
                 conv_pads,
                 extra_hidden_sizes=None,
                 hidden_W_init=L.XavierUniformInitializer(),
                 hidden_b_init=tf.zeros_initializer(),
                 output_W_init=L.XavierUniformInitializer(),
                 output_b_init=tf.zeros_initializer(),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=None,
                 input_var=None,
                 input_layer=None):
        """Build a two-branch network: a conv stack over the first
        ``input_shape`` slice of the (flattened) input and a dense stack over
        the remaining ``extra_input_shape`` slice, merged by concatenation and
        followed by joint dense layers producing ``output_dim`` units.

        :param name: variable-scope name for all created layers
        :param input_shape: shape of the conv branch input (excl. batch dim)
        :param extra_input_shape: shape of the extra (dense) branch input
        :param output_dim: number of output units
        :param hidden_sizes: sizes of the joint dense layers after the merge
        :param conv_filters: number of filters per conv layer
        :param conv_filter_sizes: filter size per conv layer
        :param conv_strides: stride per conv layer
        :param conv_pads: padding mode per conv layer
        :param extra_hidden_sizes: sizes of the dense layers on the extra
            branch (defaults to none)
        :param input_var: optional tensor to feed the input layer
        :param input_layer: optional pre-built input layer; when given, it is
            used as-is and ``input_var`` is ignored
        """
        Serializable.quick_init(self, locals())

        if extra_hidden_sizes is None:
            extra_hidden_sizes = []

        with tf.variable_scope(name):

            conv_flat_dim = np.prod(input_shape)
            extra_flat_dim = np.prod(extra_input_shape)

            # Both branches arrive flattened and concatenated in one vector.
            if input_layer is not None:
                l_in = input_layer
            else:
                l_in = L.InputLayer(
                    shape=(None, conv_flat_dim + extra_flat_dim),
                    input_var=input_var,
                    name="input")

            # Split the flat input back into its two branches and restore
            # the original (non-flat) shapes; [0] keeps the batch dimension.
            conv_slice = L.SliceLayer(l_in,
                                      indices=slice(conv_flat_dim),
                                      name="conv_slice")
            l_conv_in = L.reshape(conv_slice,
                                  ([0], ) + input_shape,
                                  name="conv_reshaped")
            extra_slice = L.SliceLayer(l_in,
                                       indices=slice(conv_flat_dim, None),
                                       name="extra_slice")
            l_extra_in = L.reshape(extra_slice,
                                   ([0], ) + extra_input_shape,
                                   name="extra_reshaped")

            # Conv branch.
            l_conv_hid = l_conv_in
            for idx, (n_filters, filter_size, stride, pad) in enumerate(
                    zip(conv_filters, conv_filter_sizes, conv_strides,
                        conv_pads)):
                l_conv_hid = L.Conv2DLayer(
                    l_conv_hid,
                    num_filters=n_filters,
                    filter_size=filter_size,
                    stride=(stride, stride),
                    pad=pad,
                    nonlinearity=hidden_nonlinearity,
                    name="conv_hidden_%d" % idx,
                )

            # Extra (dense) branch.
            l_extra_hid = l_extra_in
            for idx, size in enumerate(extra_hidden_sizes):
                l_extra_hid = L.DenseLayer(
                    l_extra_hid,
                    num_units=size,
                    nonlinearity=hidden_nonlinearity,
                    name="extra_hidden_%d" % idx,
                    W=hidden_W_init,
                    b=hidden_b_init,
                )

            # Merge both branches and run the joint dense stack.
            l_joint_hid = L.concat(
                [L.flatten(l_conv_hid, name="conv_hidden_flat"), l_extra_hid],
                name="joint_hidden")

            for idx, size in enumerate(hidden_sizes):
                l_joint_hid = L.DenseLayer(
                    l_joint_hid,
                    num_units=size,
                    nonlinearity=hidden_nonlinearity,
                    name="joint_hidden_%d" % idx,
                    W=hidden_W_init,
                    b=hidden_b_init,
                )

            l_out = L.DenseLayer(
                l_joint_hid,
                num_units=output_dim,
                nonlinearity=output_nonlinearity,
                name="output",
                W=output_W_init,
                b=output_b_init,
            )

            self._l_in = l_in
            self._l_out = l_out

            LayersPowered.__init__(self, [l_out], input_layers=[l_in])
Пример #58
0
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            dist_cls=DiagonalGaussian,
            is_protagonist=True,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param is_protagonist: if True, act in the protagonist action space;
            otherwise act in the adversary action space
        :return:
        """
        Serializable.quick_init(self, locals())
        # Pick the action space this policy controls (adversarial-RL setup
        # exposes separate protagonist/adversary spaces on the env spec).
        if is_protagonist:
            cur_action_space = env_spec.pro_action_space
        else:
            cur_action_space = env_spec.adv_action_space

        assert isinstance(cur_action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = cur_action_space.flat_dim

        # Create the mean network unless a custom one was supplied.
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        # log-std head: custom network > adaptive MLP > learned constant.
        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent log-std sharing the mean network's input.
                std_network = MLP(
                    input_shape=(obs_dim,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                # A single trainable parameter vector (state-independent).
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        # Clamp the log-std from below to avoid numerical issues.
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Compiled function mapping observations -> (mean, log_std).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
Пример #59
0
    def __init__(self,
                 n_apples=8,
                 n_bombs=8,
                 activity_range=6.,
                 robot_object_spacing=2.,
                 catch_range=1.,
                 n_bins=10,
                 sensor_range=6.,
                 sensor_span=math.pi,
                 coef_inner_rew=0.,
                 dying_cost=-10,
                 *args,
                 **kwargs):
        """Gather environment: a MuJoCo robot collects apples and avoids
        bombs inside a walled square arena.

        :param n_apples: number of apples placed in the arena
        :param n_bombs: number of bombs placed in the arena
        :param activity_range: half-width of the square arena
        :param robot_object_spacing: minimum spawn distance from the robot
        :param catch_range: distance within which an object is collected
        :param n_bins: number of sensor bins
        :param sensor_range: maximum sensing distance
        :param sensor_span: angular span of the sensors (radians)
        :param coef_inner_rew: weight of the wrapped env's own reward
        :param dying_cost: reward given when the inner robot dies
        :raises NotImplementedError: if the subclass does not set MODEL_CLASS
        """
        Serializable.quick_init(self, locals())
        self.n_apples = n_apples
        self.n_bombs = n_bombs
        self.activity_range = activity_range
        self.robot_object_spacing = robot_object_spacing
        self.catch_range = catch_range
        self.n_bins = n_bins
        self.sensor_range = sensor_range
        self.sensor_span = sensor_span
        self.coef_inner_rew = coef_inner_rew
        self.dying_cost = dying_cost
        self.objects = []
        self.viewer = None

        # for openai baseline
        self.reward_range = (-float('inf'), float('inf'))
        self.metadata = None
        # super(GatherEnv, self).__init__(*args, **kwargs)
        model_cls = self.__class__.MODEL_CLASS
        if model_cls is None:
            # NOTE: was `raise "MODEL_CLASS unspecified!"` — raising a string
            # is a TypeError in Python 3 and would mask the real error.
            raise NotImplementedError("MODEL_CLASS unspecified!")
        # Load the robot's MuJoCo XML and surround it with four walls.
        xml_path = osp.join(MODEL_DIR, model_cls.FILE)
        tree = ET.parse(xml_path)
        worldbody = tree.find(".//worldbody")
        attrs = dict(type="box",
                     conaffinity="1",
                     rgba="0.8 0.9 0.8 1",
                     condim="3")
        walldist = self.activity_range + 1
        ET.SubElement(
            worldbody, "geom",
            dict(attrs,
                 name="wall1",
                 pos="0 -%d 0" % walldist,
                 size="%d.5 0.5 1" % walldist))
        ET.SubElement(
            worldbody, "geom",
            dict(attrs,
                 name="wall2",
                 pos="0 %d 0" % walldist,
                 size="%d.5 0.5 1" % walldist))
        ET.SubElement(
            worldbody, "geom",
            dict(attrs,
                 name="wall3",
                 pos="-%d 0 0" % walldist,
                 size="0.5 %d.5 1" % walldist))
        ET.SubElement(
            worldbody, "geom",
            dict(attrs,
                 name="wall4",
                 pos="%d 0 0" % walldist,
                 size="0.5 %d.5 1" % walldist))
        # _, file_path = tempfile.mkstemp(text=True) #todo: note that this is different from snn4hrl default
        # Write the augmented model to a stable path inside the project tree
        # (a named file rather than a tempfile, added because of ec2 empty
        # xml issue).
        if 'param_name' in kwargs:
            file_path = osp.join(
                config.PROJECT_PATH,
                "sandbox/snn4hrl/envs/mujoco/gather/mujoco_models/" +
                model_cls.FILE.split(".")[0] + "_" + kwargs['param_name'] +
                "_gather.xml")
        else:
            file_path = osp.join(
                config.PROJECT_PATH,
                "sandbox/snn4hrl/envs/mujoco/gather/mujoco_models/" +
                model_cls.FILE.split(".")[0] + "_gather.xml")
        if not osp.exists(file_path):  # create file if not there
            with open(file_path, 'w+'):
                pass
        tree.write(file_path)
        # pylint: disable=not-callable
        inner_env = model_cls(
            *args, file_path=file_path,
            **kwargs)  # giving problems because of this weird tempfile
        # pylint: enable=not-callable
        ProxyEnv.__init__(
            self, inner_env)  # to access the inner env, do self.wrapped_env

        # optimization, caching obs spaces
        ub = BIG * np.ones(self.get_current_obs().shape)
        self.obs_space = spaces.Box(ub * -1, ub)
        ub = BIG * np.ones(self.get_current_robot_obs().shape)
        self.robot_obs_space = spaces.Box(ub * -1, ub)
        ub = BIG * np.ones(np.concatenate(self.get_readings()).shape)
        self.maze_obs_space = spaces.Box(ub * -1, ub)
 def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
     """Swimmer env with a randomly drawn goal velocity (oracle variant).

     :param ctrl_cost_coeff: weight of the control (action) penalty
     """
     self.ctrl_cost_coeff = ctrl_cost_coeff
     # Goal velocity is unset until sampled later (presumably by a
     # reset/sample-goals hook — TODO confirm against the full class).
     self._goal_vel = None
     super(SwimmerRandGoalOracleEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())