def __init__(self, desc='two-state', map_id=None):
    self._map_id = map_id
    Serializable.quick_init(self, locals())
    if isinstance(desc, str):
        desc = MAPS[desc]
    self.desc_choices = desc
    self.reset()
def __init__(self, *args, **kwargs): """ Constants: omega is always 0, and set a constant for background noise """ self.Om = np.array([[1,0,0],[0,1,0],[0,0,1]]) self.background = 2.0 #background noise """ Variables: These variables will be read along with the action: two_theta: detector's rotation about the z-axis -- assume elastic scattering, so omega is always 0 theta: the angle at which our neutrons strike the plane These variables are the two dimensions of our problem chi: outer ring's rotation about the x-axis phi: rotation of the eulerian cradle, varies between z- and y-axis rotation depending on how much chi rotated """ self.max_two_theta = 180 self.max_chi = 90 self.max_phi = 360 self.min_chi = -90 self.min_phi = 0 self.hit = 0 #Set up hkl and all actions super(UBEnv, self).__init__(self.model_path("UB.xml.mako"),*args, **kwargs) #Two independent bodies self.ring = find_body(self.world, "ring") #chi self.eu_cradle = find_body(self.world, "eu_cradle") #phi self.detector = find_body(self.world, "detector") #theta self.pivot = find_joint(self.world, "angular_axis") #pivot that enables angular movement Serializable.__init__(self, *args, **kwargs)
def __init__(self,
             epsilon=0.5,
             L2_reg_dual=0.,  # 1e-5,
             L2_reg_loss=0.,
             max_opt_itr=50,
             optimizer=scipy.optimize.fmin_l_bfgs_b,
             **kwargs):
    """
    :param epsilon: Max KL divergence between new policy and old policy.
    :param L2_reg_dual: Dual regularization
    :param L2_reg_loss: Loss regularization
    :param max_opt_itr: Maximum number of batch optimization iterations.
    :param optimizer: Module path to the optimizer. It must support the same
     interface as scipy.optimize.fmin_l_bfgs_b.
    :return:
    """
    Serializable.quick_init(self, locals())
    super(REPS, self).__init__(**kwargs)
    self.epsilon = epsilon
    self.L2_reg_dual = L2_reg_dual
    self.L2_reg_loss = L2_reg_loss
    self.max_opt_itr = max_opt_itr
    self.optimizer = optimizer
    self.opt_info = None
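# Hedged sketch (not from the source): the `optimizer` argument above is
# expected to follow the scipy.optimize.fmin_l_bfgs_b interface, i.e. accept a
# function returning (value, gradient) plus an initial point, and return a
# tuple whose first element is the minimizer. A minimal check of that contract:
import numpy as np
import scipy.optimize

def f_and_grad(x):
    # simple quadratic: value and gradient of 0.5 * ||x||^2
    return 0.5 * float(np.dot(x, x)), x

x_opt, f_opt, info = scipy.optimize.fmin_l_bfgs_b(f_and_grad, x0=np.ones(3))
assert np.allclose(x_opt, 0.0, atol=1e-6)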
def __init__(self, env_spec, hidden_sizes=(32, 32),
             hidden_nonlinearity=NL.tanh, prob_network=None):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy; other
     network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    if prob_network is None:
        prob_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim,),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(self, env, obs_noise=1e-1):
    super(NoisyObservationEnv, self).__init__(env)
    Serializable.quick_init(self, locals())
    self.obs_noise = obs_noise
def __init__(self, env, ma_mode):
    Serializable.quick_init(self, locals())
    self.env = env
    if hasattr(env, 'id'):
        self.env_id = env.id
    else:
        self.env_id = 'MA-Wrapper-v0'
    if ma_mode == 'centralized':
        obsfeat_space = convert_gym_space(env.agents[0].observation_space,
                                          n_agents=len(env.agents))
        action_space = convert_gym_space(env.agents[0].action_space,
                                         n_agents=len(env.agents))
    elif ma_mode in ['decentralized', 'concurrent']:
        obsfeat_space = convert_gym_space(env.agents[0].observation_space,
                                          n_agents=1)
        action_space = convert_gym_space(env.agents[0].action_space,
                                         n_agents=1)
    else:
        raise NotImplementedError
    self._observation_space = obsfeat_space
    self._action_space = action_space
    if hasattr(env, 'timestep_limit'):
        self._horizon = env.timestep_limit
    else:
        self._horizon = 250
def __init__(self, obj, method_name, args, kwargs):
    self._serializable_initialized = False
    Serializable.quick_init(self, locals())
    self.obj = obj
    self.method_name = method_name
    self.args = args
    self.kwargs = kwargs
def __init__(self, goal_vel=None, *args, **kwargs):
    self.goal_vel = goal_vel
    super(HalfCheetahEnvRandDirec, self).__init__(*args, **kwargs)
    # Re-assign in case the superclass constructors reset the attribute.
    self.goal_vel = goal_vel
    Serializable.__init__(self, *args, **kwargs)
    self.reset(reset_args=goal_vel)
def __init__(self, env_name, record_video=True, video_schedule=None,
             log_dir=None, record_log=True, force_reset=False):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since "
                       "snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)
    self.env = env
    self.env_id = env.spec.id

    monitor_manager.logger.setLevel(logging.WARNING)

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir,
                                        video_callable=video_schedule,
                                        force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    self._action_space = convert_gym_space(env.action_space)
    self._horizon = env.spec.timestep_limit
    self._log_dir = log_dir
    self._force_reset = force_reset
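# Hedged sketch (an assumption, not the source implementation): a capped cubic
# video schedule of the kind gym uses records episodes whose index is a
# perfect cube until 1000, then every 1000th episode. Roughly:
def capped_cubic_video_schedule(episode_id):
    if episode_id < 1000:
        return round(episode_id ** (1. / 3)) ** 3 == episode_id
    return episode_id % 1000 == 0

assert capped_cubic_video_schedule(27)        # 3**3 -> record
assert not capped_cubic_video_schedule(28)
assert capped_cubic_video_schedule(2000)      # multiple of 1000 -> record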
def __init__(self,
             env,
             policy,
             n_itr=500,
             max_path_length=500,
             discount=0.99,
             sigma0=1.,
             batch_size=None,
             plot=False,
             **kwargs):
    """
    :param n_itr: Number of iterations.
    :param max_path_length: Maximum length of a single rollout.
    :param batch_size: # of samples from trajectories drawn from the parameter
     distribution; when this is set, n_samples is ignored.
    :param discount: Discount.
    :param plot: Plot evaluation run after each iteration.
    :param sigma0: Initial std for the parameter distribution.
    :return:
    """
    Serializable.quick_init(self, locals())
    self.env = env
    self.policy = policy
    self.plot = plot
    self.sigma0 = sigma0
    self.discount = discount
    self.max_path_length = max_path_length
    self.n_itr = n_itr
    self.batch_size = batch_size
def __init__(self, name, max_opt_itr=20, callback=None):
    Serializable.quick_init(self, locals())
    self._name = name
    self._max_opt_itr = max_opt_itr
    self._opt_fun = None
    self._target = None
    self._callback = callback
def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
    self.ctrl_cost_coeff = ctrl_cost_coeff
    super(SwimmerEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self,
             cg_iters=10,
             reg_coeff=1e-5,
             subsample_factor=0.1,
             backtrack_ratio=0.8,
             max_backtracks=15,
             debug_nan=False):
    """
    :param cg_iters: The number of CG iterations used to calculate A^-1 g
    :param reg_coeff: A small value so that A -> A + reg*I
    :param subsample_factor: Subsampling factor to reduce samples when using
     conjugate gradient. Since the computation time for the descent direction
     dominates, this can greatly reduce the overall computation time.
    :param debug_nan: if set to True, NanGuard will be added to the
     compilation, and ipdb will be invoked when a nan is detected
    :return:
    """
    Serializable.quick_init(self, locals())
    self._cg_iters = cg_iters
    self._reg_coeff = reg_coeff
    self._subsample_factor = subsample_factor
    self._backtrack_ratio = backtrack_ratio
    self._max_backtracks = max_backtracks

    self._opt_fun = None
    self._target = None
    self._max_constraint_val = None
    self._constraint_name = None
    self._debug_nan = debug_nan
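# Illustrative sketch (an assumption about usage, not the source): what
# cg_iters and reg_coeff control above. Conjugate gradient approximately
# solves (A + reg*I) x = g without ever forming A^-1, using only
# matrix-vector products:
import numpy as np

def conjugate_gradient(Avp, g, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()          # residual g - A x, starting from x = 0
    p = r.copy()
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = Avp(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

A = np.array([[4., 1.], [1., 3.]])
reg_coeff = 1e-5
g = np.array([1., 2.])
x = conjugate_gradient(lambda v: A.dot(v) + reg_coeff * v, g)
assert np.allclose(A.dot(x), g, atol=1e-3)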
def __init__(self,
             update_method=lasagne.updates.adam,
             learning_rate=1e-3,
             max_epochs=1000,
             tolerance=1e-6,
             batch_size=32,
             callback=None,
             verbose=False,
             **kwargs):
    """
    :param max_epochs:
    :param tolerance:
    :param update_method:
    :param batch_size: None or an integer. If None the whole dataset will be
     used.
    :param callback:
    :param kwargs:
    :return:
    """
    Serializable.quick_init(self, locals())
    self._opt_fun = None
    self._target = None
    self._callback = callback
    update_method = partial(update_method, learning_rate=learning_rate)
    self._update_method = update_method
    self._max_epochs = max_epochs
    self._tolerance = tolerance
    self._batch_size = batch_size
    self._verbose = verbose
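# Minimal sketch of the partial(...) trick above: the learning rate is bound
# into the update method ahead of time, so later calls only supply the loss
# and parameters. `fake_update_method` is a hypothetical stand-in for
# lasagne.updates.adam.
from functools import partial

def fake_update_method(loss, params, learning_rate):
    return {p: p - learning_rate * loss for p in params}

update_method = partial(fake_update_method, learning_rate=1e-3)
updates = update_method(loss=2.0, params=[10.0])
assert abs(updates[10.0] - (10.0 - 1e-3 * 2.0)) < 1e-12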
def __init__(self, regressors):
    """
    :param regressors: List of individual regressors
    """
    Serializable.quick_init(self, locals())
    self.regressors = regressors
    self.output_dims = [x.output_dim for x in regressors]
def __init__(self,
             name,
             max_opt_itr=20,
             initial_penalty=1.0,
             min_penalty=1e-2,
             max_penalty=1e6,
             increase_penalty_factor=2,
             decrease_penalty_factor=0.5,
             max_penalty_itr=10,
             adapt_penalty=True):
    Serializable.quick_init(self, locals())
    self._name = name
    self._max_opt_itr = max_opt_itr
    self._penalty = initial_penalty
    self._initial_penalty = initial_penalty
    self._min_penalty = min_penalty
    self._max_penalty = max_penalty
    self._increase_penalty_factor = increase_penalty_factor
    self._decrease_penalty_factor = decrease_penalty_factor
    self._max_penalty_itr = max_penalty_itr
    self._adapt_penalty = adapt_penalty

    self._opt_fun = None
    self._target = None
    self._max_constraint_val = None
    self._constraint_name = None
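# Hedged sketch (an assumption about how the stored factors are typically
# used): an adaptive penalty loop scales the penalty up while the constraint
# is violated and down once it is satisfied, clipped to
# [min_penalty, max_penalty].
def adapt_penalty(constraint_val, max_constraint_val, penalty,
                  increase_factor=2.0, decrease_factor=0.5,
                  min_penalty=1e-2, max_penalty=1e6):
    if constraint_val > max_constraint_val:
        penalty *= increase_factor
    else:
        penalty *= decrease_factor
    return min(max(penalty, min_penalty), max_penalty)

assert adapt_penalty(0.05, 0.01, penalty=1.0) == 2.0   # violated -> increase
assert adapt_penalty(0.005, 0.01, penalty=1.0) == 0.5  # satisfied -> decrease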
def __init__(self, mdp_cls, mdp_args):
    Serializable.quick_init(self, locals())
    self.mdp_cls = mdp_cls
    self.mdp_args = dict(mdp_args)
    self.mdp_args["template_args"] = dict(noise=True)
    mdp = self.gen_mdp()
    super(IdentificationEnv, self).__init__(mdp)
def __init__(self, observation_space, action_space):
    Serializable.quick_init(self, locals())
    self._observation_space = observation_space
    self._action_space = action_space
def __init__(self, goal_reward=10, actuation_cost_coeff=30,
             distance_cost_coeff=1, init_sigma=0.1):
    super().__init__()
    Serializable.quick_init(self, locals())

    self.dynamics = PointDynamics(dim=2, sigma=0)
    self.init_mu = np.zeros(2, dtype=np.float32)
    self.init_sigma = init_sigma
    self.goal_positions = np.array(
        [
            [5, 0],
            [-5, 0],
            [0, 5],
            [0, -5]
        ],
        dtype=np.float32
    )
    self.goal_threshold = 1.
    self.goal_reward = goal_reward
    self.action_cost_coeff = actuation_cost_coeff
    self.distance_cost_coeff = distance_cost_coeff
    self.xlim = (-7, 7)
    self.ylim = (-7, 7)
    self.vel_bound = 1.
    self.reset()
    self.observation = None

    self._ax = None
    self._env_lines = []
    self.fixed_plots = None
    self.dynamic_plots = []
def __init__(self, env_spec, obs_pl, action, scope_name=None):
    Serializable.quick_init(self, locals())

    self._obs_pl = obs_pl
    self._action = action
    self._scope_name = (tf.get_variable_scope().name
                        if not scope_name else scope_name)
    super(NNPolicy, self).__init__(env_spec)
def __init__(self, env_spec, max_sigma=1.0, min_sigma=0.1,
             decay_period=1000000):
    assert isinstance(env_spec.action_space, Box)
    assert len(env_spec.action_space.shape) == 1
    Serializable.quick_init(self, locals())
    self._max_sigma = max_sigma
    self._min_sigma = min_sigma
    self._decay_period = decay_period
    self._action_space = env_spec.action_space
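# Hedged sketch (an assumption about the strategy's behavior, mirroring the
# common rllab-style pattern): the exploration noise scale is typically
# interpolated linearly from max_sigma down to min_sigma over decay_period
# steps, then held constant.
def current_sigma(t, max_sigma=1.0, min_sigma=0.1, decay_period=1000000):
    frac = min(1.0, t / float(decay_period))
    return max_sigma - (max_sigma - min_sigma) * frac

assert current_sigma(0) == 1.0
assert abs(current_sigma(500000) - 0.55) < 1e-12
assert current_sigma(10**7) == 0.1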
def __setstate__(self, state):
    """Set the Serializable state of the RLAlgorithm instance."""
    Serializable.__setstate__(self, state)
    self.qf.set_param_values(state['qf-params'])
    self.policy.set_param_values(state['policy-params'])
    self.pool.__setstate__(state['pool'])
    self.env.__setstate__(state['env'])
def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
    self.ctrl_cost_coeff = ctrl_cost_coeff
    self._goal_vel = None
    super(SwimmerRandGoalEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self, *inputs, name, hidden_layer_sizes):
    Parameterized.__init__(self)
    Serializable.quick_init(self, locals())

    self._name = name
    self._inputs = inputs
    self._layer_sizes = list(hidden_layer_sizes) + [1]

    self._output = self._output_for(*self._inputs)
def __init__(self, alive_coeff=1, ctrl_cost_coeff=0.01, *args, **kwargs):
    self.alive_coeff = alive_coeff
    self.ctrl_cost_coeff = ctrl_cost_coeff
    super(HopperEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self, env, action_delay=3):
    assert action_delay > 0, \
        "action_delay must be positive; do not wrap the env otherwise"
    super(DelayedActionEnv, self).__init__(env)
    Serializable.quick_init(self, locals())
    self.action_delay = action_delay
    self._queued_actions = None
def __init__(self, max_opt_itr=20, batch_size=32, cg_batch_size=100,
             callback=None):
    Serializable.quick_init(self, locals())
    self._max_opt_itr = max_opt_itr
    self._opt_fun = None
    self._target = None
    self._batch_size = batch_size
    self._cg_batch_size = cg_batch_size
    self._hf_optimizer = None
    self._callback = callback
def __init__(self, desc_str='4x4', max_traj_length=10, goal_reward=10.0):
    Serializable.quick_init(self, locals())
    self.desc_str = desc_str  # Map will be loaded in `self.reset`
    self.max_traj_length = max_traj_length
    # list(...) is needed in Python 3, where map returns a lazy iterator.
    self.n_row, self.n_col = np.array(list(map(list, self._fetch_map()))).shape
    self.state = None
    self.goal_reward = goal_reward
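# Why the list(...) around map(...) matters above: in Python 3, map returns a
# lazy iterator, and np.array over an iterator yields a 0-d object array, so
# .shape would not give (n_row, n_col). With the explicit list:
import numpy as np

desc = ["SFFF", "FHFH", "FFFH", "HFFG"]   # a stand-in 4x4 map
n_row, n_col = np.array(list(map(list, desc))).shape
assert (n_row, n_col) == (4, 4)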
def __init__(self,
             env_spec,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.relu,
             action_merge_layer=-2,
             output_nonlinearity=None,
             bn=False):
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim),
                         name="obs")
    l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim),
                            name="actions")

    n_layers = len(hidden_sizes) + 1

    if n_layers > 1:
        action_merge_layer = \
            (action_merge_layer % n_layers + n_layers) % n_layers
    else:
        action_merge_layer = 1

    l_hidden = l_obs

    for idx, size in enumerate(hidden_sizes):
        if bn:
            l_hidden = batch_norm(l_hidden)

        if idx == action_merge_layer:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_hidden = L.DenseLayer(
            l_hidden,
            num_units=size,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % (idx + 1)
        )

    if action_merge_layer == n_layers:
        l_hidden = L.ConcatLayer([l_hidden, l_action])

    l_output = L.DenseLayer(
        l_hidden,
        num_units=1,
        nonlinearity=output_nonlinearity,
        name="output"
    )

    output_var = L.get_output(l_output, deterministic=True)

    self._f_qval = tensor_utils.compile_function(
        [l_obs.input_var, l_action.input_var], output_var)
    self._output_layer = l_output
    self._obs_layer = l_obs
    self._action_layer = l_action
    self._output_nonlinearity = output_nonlinearity

    LayersPowered.__init__(self, [l_output])
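# What the double-modulo above computes: it normalizes a possibly negative
# layer index (Python-style, e.g. -2) into the range [0, n_layers):
def normalize_layer_index(action_merge_layer, n_layers):
    return (action_merge_layer % n_layers + n_layers) % n_layers

# With hidden_sizes=(32, 32), n_layers = 3 and the default -2 maps to 1,
# i.e. actions are concatenated in before the second hidden layer.
assert normalize_layer_index(-2, 3) == 1
assert normalize_layer_index(0, 3) == 0
assert normalize_layer_index(2, 3) == 2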
def __init__(self, *args, **kwargs):
    super(CartpoleSwingupEnvX, self).__init__(
        self.model_path("cartpole.xml.mako"),
        *args, **kwargs
    )
    self.max_cart_pos = 3
    self.max_reward_cart_pos = 3
    self.cart = find_body(self.world, "cart")
    self.pole = find_body(self.world, "pole")
    Serializable.__init__(self, *args, **kwargs)
def __init__(self,
             # goal_generator,
             n_bins=20,
             sensor_range=10.,
             sensor_span=math.pi,
             maze_id=0,
             length=1,
             maze_height=0.5,
             maze_size_scaling=2,
             coef_inner_rew=1.,  # a coef of 0 gives no reward to the maze from the wrapped env.
             # goal_rew=1.,  # reward obtained when reaching the goal
             include_maze_obs=False,
             *args,
             **kwargs):
    Serializable.quick_init(self, locals())
    self._n_bins = n_bins
    self._sensor_range = sensor_range
    self._sensor_span = sensor_span
    self._maze_id = maze_id
    self.length = length
    self.coef_inner_rew = coef_inner_rew
    # self.goal_rew = goal_rew
    self.include_maze_obs = include_maze_obs

    model_cls = self.__class__.MODEL_CLASS
    if model_cls is None:
        # raising a bare string is invalid in Python 3; raise an exception
        raise NotImplementedError("MODEL_CLASS unspecified!")
    xml_path = osp.join(MODEL_DIR, model_cls.FILE)
    tree = ET.parse(xml_path)
    worldbody = tree.find(".//worldbody")

    self.MAZE_HEIGHT = height = maze_height
    self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
    self.MAZE_STRUCTURE = structure = construct_maze(maze_id=self._maze_id,
                                                     length=self.length)
    if self._maze_id == 0:
        self.LINEARIZED = MazeEnv.MAZE_0
    elif self._maze_id == 11:
        self.LINEARIZED = MazeEnv.MAZE_11
    elif self._maze_id == 13:
        self.LINEARIZED = MazeEnv.MAZE_13
    elif self._maze_id == 14:
        self.LINEARIZED = MazeEnv.MAZE_14
    else:
        self.LINEARIZED = None

    torso_x, torso_y = self._find_robot()
    self._init_torso_x = torso_x
    self._init_torso_y = torso_y

    for i in range(len(structure)):
        for j in range(len(structure[0])):
            if str(structure[i][j]) == '1':
                # offset all coordinates so that the robot starts at the origin
                ET.SubElement(
                    worldbody, "geom",
                    name="block_%d_%d" % (i, j),
                    pos="%f %f %f" % (j * size_scaling - torso_x,
                                      i * size_scaling - torso_y,
                                      height / 2 * size_scaling),
                    size="%f %f %f" % (0.5 * size_scaling,
                                       0.5 * size_scaling,
                                       height / 2 * size_scaling),
                    type="box",
                    material="",
                    contype="1",
                    conaffinity="1",
                    rgba="0.4 0.4 0.4 0.5")

    torso = tree.find(".//body[@name='torso']")
    geoms = torso.findall(".//geom")
    for geom in geoms:
        if 'name' not in geom.attrib:
            raise Exception("Every geom of the torso must have a name "
                            "defined")

    # Get all line segments of the goal and the obstacles
    segments = []
    for i in range(len(structure)):
        for j in range(len(structure[0])):
            if structure[i][j] == 1 or structure[i][j] == 'g':
                cx = j * size_scaling - self._init_torso_x
                cy = i * size_scaling - self._init_torso_y
                x1 = cx - 0.5 * size_scaling
                x2 = cx + 0.5 * size_scaling
                y1 = cy - 0.5 * size_scaling
                y2 = cy + 0.5 * size_scaling
                struct_segments = [
                    ((x1, y1), (x2, y1)),
                    ((x2, y1), (x2, y2)),
                    ((x2, y2), (x1, y2)),
                    ((x1, y2), (x1, y1)),
                ]
                for seg in struct_segments:
                    segments.append(dict(
                        segment=seg,
                        type=structure[i][j],
                    ))
    self.segments = segments

    if self.__class__.MAZE_MAKE_CONTACTS:
        contact = ET.SubElement(tree.find("."), "contact")
        for i in range(len(structure)):
            for j in range(len(structure[0])):
                if str(structure[i][j]) == '1':
                    for geom in geoms:
                        ET.SubElement(
                            contact, "pair",
                            geom1=geom.attrib["name"],
                            geom2="block_%d_%d" % (i, j))

    _, file_path = tempfile.mkstemp(text=True)
    # write a temporary file with the robot specifications, rather than
    # modifying the original model file
    tree.write(file_path)

    self._goal_range = self._find_goal_range()
    self._cached_segments = None

    inner_env = model_cls(file_path=file_path, *args, **kwargs)  # file with the robot specifications
    ProxyEnv.__init__(self, inner_env)  # here is where the robot env will be initialized
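# Small runnable illustration of the block-placement arithmetic above: each
# structure cell (i, j) equal to 1 becomes a box centered at
# (j*scale - torso_x, i*scale - torso_y), so the robot's start cell maps to
# the origin. Hypothetical 3x3 structure with 'r' marking the robot:
structure = [[1, 1, 1],
             [1, 'r', 1],
             [1, 1, 1]]
size_scaling = 2
torso_x, torso_y = 1 * size_scaling, 1 * size_scaling  # robot at cell (1, 1)

blocks = [(j * size_scaling - torso_x, i * size_scaling - torso_y)
          for i, row in enumerate(structure)
          for j, cell in enumerate(row)
          if str(cell) == '1']
assert (0, 0) not in blocks   # the robot cell stays free at the origin
assert (-2, -2) in blocks     # cell (0, 0) sits at (-2, -2)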
def __init__(self,
             env_spec,
             name='qnet',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.relu,
             action_merge_layer=-2,
             output_nonlinearity=None,
             hidden_W_init=L.XavierUniformInitializer(),
             hidden_b_init=tf.zeros_initializer(),
             output_W_init=L.XavierUniformInitializer(),
             output_b_init=tf.zeros_initializer(),
             bn=False):
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim),
                             name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim),
                                name="actions")

        n_layers = len(hidden_sizes) + 1

        if n_layers > 1:
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = L.batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(l_hidden,
                                    num_units=size,
                                    W=hidden_W_init,
                                    b=hidden_b_init,
                                    nonlinearity=hidden_nonlinearity,
                                    name="h%d" % (idx + 1))

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(l_hidden,
                                num_units=1,
                                W=output_W_init,
                                b=output_b_init,
                                nonlinearity=output_nonlinearity,
                                name="output")

        # output_var = L.get_output(l_output, deterministic=True).flatten()
        output_var = tf.reshape(L.get_output(l_output, deterministic=True),
                                (-1,))

        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        LayersPowered.__init__(self, [l_output])
def __init__(self,
             name,
             input_shape,
             output_dim,
             prob_network=None,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             optimizer=None,
             tr_optimizer=None,
             use_trust_region=True,
             step_size=0.01,
             normalize_inputs=True,
             no_initial_trust_region=True):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
     network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean
     network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")
        if tr_optimizer is None:
            tr_optimizer = ConjugateGradientOptimizer()

        self.output_dim = output_dim
        self.optimizer = optimizer
        self.tr_optimizer = tr_optimizer

        if prob_network is None:
            prob_network = MLP(input_shape=input_shape,
                               output_dim=output_dim,
                               hidden_sizes=hidden_sizes,
                               hidden_nonlinearity=hidden_nonlinearity,
                               output_nonlinearity=tf.nn.softmax,
                               name="prob_network")

        l_prob = prob_network.output_layer

        LayersPowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim],
                                name="ys")
        old_prob_var = tf.placeholder(dtype=tf.float32,
                                      shape=[None, output_dim],
                                      name="old_prob")

        x_mean_var = tf.get_variable(
            name="x_mean",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(0., dtype=tf.float32))
        x_std_var = tf.get_variable(
            name="x_std",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(1., dtype=tf.float32))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob,
                                {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

        loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

        predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1),
                                               output_dim)

        self.prob_network = prob_network
        self.f_predict = tensor_utils.compile_function([xs_var], predicted)
        self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self.l_prob = l_prob

        self.optimizer.update_opt(loss=loss,
                                  target=self,
                                  network_outputs=[prob_var],
                                  inputs=[xs_var, ys_var])
        self.tr_optimizer.update_opt(loss=loss,
                                     target=self,
                                     network_outputs=[prob_var],
                                     inputs=[xs_var, ys_var, old_prob_var],
                                     leq_constraint=(mean_kl, step_size))

        self.use_trust_region = use_trust_region
        self.name = name

        self.normalize_inputs = normalize_inputs
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.first_optimized = not no_initial_trust_region
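# Numpy analogue of the f_predict output above: class probabilities are
# turned into a one-hot prediction via argmax (a sketch of what
# tensor_utils.to_onehot_sym(tf.argmax(...)) produces).
import numpy as np

def to_onehot(indices, dim):
    out = np.zeros((len(indices), dim))
    out[np.arange(len(indices)), indices] = 1.0
    return out

probs = np.array([[0.1, 0.7, 0.2],
                  [0.6, 0.3, 0.1]])
pred = to_onehot(np.argmax(probs, axis=1), probs.shape[1])
assert (pred == np.array([[0., 1., 0.], [1., 0., 0.]])).all()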
def __init__(self,
             env_spec,
             hidden_sizes=(32,),
             state_include_action=True,
             hidden_nonlinearity=NL.tanh,
             learn_std=True,
             init_std=1.0,
             output_nonlinearity=None,
             **kwargs):
    """
    :param env_spec: A spec for the env.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    Serializable.quick_init(self, locals())
    super(GaussianRNNPolicy, self).__init__(env_spec)

    assert len(hidden_sizes) == 1

    if state_include_action:
        obs_dim = env_spec.observation_space.flat_dim + \
            env_spec.action_space.flat_dim
    else:
        obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    self.n_hidden = hidden_sizes[0]

    mean_network = self.create_mean_network(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_dim=hidden_sizes[0],
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        **kwargs)

    l_log_std = ParamLayer(
        mean_network.input_layer,
        num_units=action_dim,
        param=lasagne.init.Constant(np.log(init_std)),
        name="output_log_std",
        trainable=learn_std,
    )

    l_step_log_std = ParamLayer(
        mean_network.step_input_layer,
        num_units=action_dim,
        param=l_log_std.param,
        name="step_output_log_std",
        trainable=learn_std,
    )

    self._mean_network = mean_network
    self._l_log_std = l_log_std
    self._state_include_action = state_include_action

    self._f_step_mean_std = ext.compile_function(
        [
            mean_network.step_input_layer.input_var,
            mean_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            mean_network.step_output_layer,
            l_step_log_std,
            mean_network.step_hidden_layer
        ]))

    self._prev_action = None
    self._prev_hidden = None
    self._hidden_sizes = hidden_sizes
    self._dist = RecurrentDiagonalGaussian(action_dim)

    self.reset()
    self.greedy = False

    LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
def __init__(self,
             name,
             env_spec,
             hidden_sizes=(32, 32),
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             min_std=1e-6,
             std_hidden_nonlinearity=tf.nn.tanh,
             hidden_nonlinearity=tf.nn.tanh,
             output_nonlinearity=None,
             mean_network=None,
             std_network=None,
             std_parametrization='exp'):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are
     a few options:
        - exp: the logarithm of the std will be stored, and an exponential
          transformation applied
        - softplus: the std will be computed as log(1+exp(x))
    :return:
    """
    Serializable.quick_init(self, locals())
    # assert isinstance(env_spec.action_space, Box)

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=(obs_dim,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_std_param = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    name="std_network",
                    input_shape=(obs_dim,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_std_param = std_network.output_layer
            else:
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                l_std_param = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )

        self.std_parametrization = std_parametrization

        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

        # mean_var, log_std_var = L.get_output([l_mean, l_std_param])
        #
        # if self.min_std_param is not None:
        #     log_std_var = tf.maximum(log_std_var, np.log(min_std))
        #
        # self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_std_param = l_std_param

        self._dist = DiagonalGaussian(action_dim)

        LayersPowered.__init__(self, [l_mean, l_std_param])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        dist_info_sym = self.dist_info_sym(
            mean_network.input_layer.input_var, dict())
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        self._f_dist = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
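# The two std parametrizations above, checked numerically: the stored
# parameter is chosen so that applying the forward transform recovers
# init_std exactly.
import numpy as np

init_std = 1.0
# 'exp': store log(std), recover with exp
param_exp = np.log(init_std)
assert np.isclose(np.exp(param_exp), init_std)
# 'softplus': store inverse-softplus(std), recover with log(1 + exp(x))
param_softplus = np.log(np.exp(init_std) - 1)
assert np.isclose(np.log(1 + np.exp(param_softplus)), init_std)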
def __init__(self, goal_args=('noisy', (.6, .2), .1), frame_skip=5,
             *args, **kwargs):
    self.goal_args = goal_args
    super(PickerEnv, self).__init__(frame_skip=frame_skip, *args, **kwargs)
    Serializable.__init__(self, goal_args, frame_skip, *args, **kwargs)
def __init__(self, ip='127.0.0.1', port=9397):
    self._conn = ZMQConnection(ip, port)
    self.prev_action = 0.
    Serializable.quick_init(self, locals())
def __init__(self,
             name,
             env_spec,
             hidden_dim=32,
             feature_network=None,
             state_include_action=True,
             hidden_nonlinearity=tf.tanh,
             gru_layer_cls=L.GRULayer,
             learn_std=True,
             init_std=1.0,
             output_nonlinearity=None):
    """
    :param env_spec: A spec for the env.
    :param hidden_dim: dimension of hidden layer
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    with tf.variable_scope(name):
        Serializable.quick_init(self, locals())
        super(GaussianGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        l_input = L.InputLayer(
            shape=(None, None, input_dim),
            name="input"
        )

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            l_feature = L.OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: tf.reshape(
                    flat_feature,
                    tf.pack([tf.shape(input)[0], tf.shape(input)[1],
                             feature_dim])
                ),
                shape_op=lambda _, input_shape: (
                    input_shape[0], input_shape[1], feature_dim)
            )

        mean_network = GRUNetwork(
            input_shape=(feature_dim,),
            input_layer=l_feature,
            output_dim=action_dim,
            hidden_dim=hidden_dim,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            gru_layer_cls=gru_layer_cls,
            name="mean_network"
        )

        l_log_std = L.ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=tf.constant_initializer(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        l_step_log_std = L.ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self.mean_network = mean_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        flat_input_var = tf.placeholder(
            dtype=tf.float32, shape=(None, input_dim), name="flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(
                l_flat_feature,
                {feature_network.input_layer: flat_input_var})

        self.f_step_mean_std = tensor_utils.compile_function(
            [
                flat_input_var,
                # mean_network.step_prev_hidden_layer.input_var,
                mean_network.step_prev_state_layer.input_var,
            ],
            L.get_output([
                mean_network.step_output_layer,
                l_step_log_std,
                mean_network.step_hidden_layer,
            ], {mean_network.step_input_layer: feature_var})
        )

        self.l_log_std = l_log_std

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.prev_actions = None
        self.prev_hiddens = None
        self.dist = RecurrentDiagonalGaussian(action_dim)

        out_layers = [mean_network.output_layer, l_log_std, l_step_log_std]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LayersPowered.__init__(self, out_layers)
def __init__(self,
             env_spec,
             name='qnet',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.relu,
             action_merge_layer=-2,
             output_nonlinearity=None,
             eqf_use_full_qf=False,
             eqf_sample_size=1,
             bn=False):
    Serializable.quick_init(self, locals())

    assert not env_spec.action_space.is_discrete
    self._env_spec = env_spec

    with tf.variable_scope(name):
        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim),
                             name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim),
                                name="actions")

        n_layers = len(hidden_sizes) + 1

        if n_layers > 1:
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(l_hidden,
                                    num_units=size,
                                    nonlinearity=hidden_nonlinearity,
                                    name="h%d" % (idx + 1))

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(l_hidden,
                                num_units=1,
                                nonlinearity=output_nonlinearity,
                                name="output")

        output_var = L.get_output(l_output, deterministic=True)
        output_var = tf.reshape(output_var, (-1,))

        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        # This is not true for most common cases, e.g. VPG.
        self.eqf_use_full_qf = eqf_use_full_qf
        self.eqf_sample_size = eqf_sample_size

        LayersPowered.__init__(self, [l_output])
def __init__(self,
             name,
             input_shape,
             output_dim,
             mean_network=None,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             output_nonlinearity=tf.identity,
             optimizer=None,
             use_trust_region=True,
             step_size=0.01,
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             std_nonlinearity=None,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.0):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
     network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean
     network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param learn_std: Whether to learn the standard deviations. Only
     effective if adaptive_std is False. If adaptive_std is True, this
     parameter is ignored, and the weights for the std network are always
     learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean.
    :param std_hidden_sizes: Number of hidden units of each layer of the std
     network. Only used if `std_share_network` is False. It defaults to the
     same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std
     network. Only used if `std_share_network` is False. It defaults to the
     same non-linearity as the mean.
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer
        self._subsample_factor = subsample_factor

        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = MLP(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = L.ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=tf.constant_initializer(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LayersPowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32, name="ys",
                                shape=(None, output_dim))
        # The placeholder below previously reused the name "ys"; give it a
        # distinct name.
        old_means_var = tf.placeholder(dtype=tf.float32, name="old_means",
                                       shape=(None, output_dim))
        old_log_stds_var = tf.placeholder(dtype=tf.float32,
                                          name="old_log_stds",
                                          shape=(None, output_dim))

        x_mean_var = tf.Variable(np.zeros((1,) + input_shape, dtype=np.float32),
                                 name="x_mean", trainable=False)
        x_std_var = tf.Variable(np.ones((1,) + input_shape, dtype=np.float32),
                                name="x_std", trainable=False)
        y_mean_var = tf.Variable(np.zeros((1, output_dim), dtype=np.float32),
                                 name="y_mean", trainable=False)
        y_std_var = tf.Variable(np.ones((1, output_dim), dtype=np.float32),
                                name="y_std", trainable=False)

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + tf.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - tf.log(y_std_var)

        # Symbolic prediction, used in constructing the meta-learning
        # objective.
        def normalized_means_var_sym(xs, params):
            inputs = OrderedDict({mean_network.input_layer: xs})
            inputs.update(params)
            return L.get_output(layer_or_layers=l_mean, inputs=inputs)

        means_var_sym = lambda xs, params: normalized_means_var_sym(
            xs=xs, params=params) * y_std_var + y_mean_var

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        mean_kl = tf.cast(
            tf.reduce_mean(
                dist.kl_sym(
                    dict(mean=normalized_old_means_var,
                         log_std=normalized_old_log_stds_var),
                    normalized_dist_info_vars,
                )), tf.float32)

        # Squared-error loss; a negative log-likelihood variant was left
        # commented out in the original:
        # loss = -tf.cast(tf.reduce_mean(dist.log_likelihood_sym(
        #     normalized_ys_var, normalized_dist_info_vars)), tf.float32)
        loss = tf.cast(
            tf.reduce_mean(tf.square(normalized_ys_var - normalized_means_var))
            + tf.reduce_mean(tf.square(normalized_log_stds_var)),
            tf.float32)

        self._f_predict = tensor_utils.compile_function([xs_var], means_var)
        self._f_pdists = tensor_utils.compile_function(
            [xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._f_predict_sym = means_var_sym
        self.loss_sym = loss

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )
        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_means_var,
                                        old_log_stds_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
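# Numerical check of the output (de)normalization identities used above: if
# the network predicts in normalized space, then
#   means = normalized_means * y_std + y_mean
#   log_stds = normalized_log_stds + log(y_std)
# together map a unit-scale Gaussian fit back to the original output scale.
import numpy as np

y_mean, y_std = 3.0, 2.0
normalized_mean, normalized_log_std = 0.5, np.log(1.0)
mean = normalized_mean * y_std + y_mean
log_std = normalized_log_std + np.log(y_std)
assert np.isclose(mean, 4.0)
assert np.isclose(np.exp(log_std), 2.0)   # std scales by y_std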
def __init__(self, *args, **kwargs):
    super(ReacherEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self, *args, **kwargs):
    super(BlkCmplxObs, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self,
             name,
             env_spec,
             hidden_sizes=(32, 32),
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             min_std=1e-6,
             std_hidden_nonlinearity=tf.nn.tanh,
             hidden_nonlinearity=tf.nn.tanh,
             output_nonlinearity=tf.identity,
             mean_network=None,
             std_network=None,
             std_parametrization='exp',
             grad_step_size=1.0,
             stop_grad=False):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are
     a few options:
        - exp: the logarithm of the std will be stored, and an exponential
          transformation applied
        - softplus: the std will be computed as log(1+exp(x))
    :param grad_step_size: the step size taken in the learner's gradient
     update; sampled uniformly if it is a range, e.g. [0.1, 1]
    :param stop_grad: whether or not to stop the gradient through the gradient.
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    self.action_dim = env_spec.action_space.flat_dim
    self.n_hidden = len(hidden_sizes)
    self.hidden_nonlinearity = hidden_nonlinearity
    self.output_nonlinearity = output_nonlinearity
    self.input_shape = (None, obs_dim)
    self.step_size = grad_step_size
    self.stop_grad = stop_grad
    if type(self.step_size) == list:
        raise NotImplementedError("removing this since it didn't work well")

    # create network
    if mean_network is None:
        # TODO: this should not be a method of the policy --> helper
        self.all_params = self.create_MLP(
            name="mean_network",
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
        )
        self.input_tensor, _ = self.forward_MLP(
            'mean_network', self.all_params,
            reuse=None  # Need to run this for batch norm
        )
        forward_mean = lambda x, params, is_train: self.forward_MLP(
            'mean_network', params, input_tensor=x, is_training=is_train)[1]
    else:
        raise NotImplementedError('Not supported.')

    if std_network is not None:
        raise NotImplementedError('Not supported.')
    else:
        if adaptive_std:
            raise NotImplementedError('Not supported.')
        else:
            if std_parametrization == 'exp':
                init_std_param = np.log(init_std)
            elif std_parametrization == 'softplus':
                init_std_param = np.log(np.exp(init_std) - 1)
            else:
                raise NotImplementedError
            self.all_params['std_param'] = make_param_layer(
                num_units=self.action_dim,
                param=tf.constant_initializer(init_std_param),
                name="output_std_param",
                trainable=learn_std,
            )
            forward_std = lambda x, params: forward_param_layer(
                x, params['std_param'])
    self.all_param_vals = None

    # unify forward mean and forward std into a single function
    self._forward = lambda obs, params, is_train: (
        forward_mean(obs, params, is_train), forward_std(obs, params))

    self.std_parametrization = std_parametrization

    if std_parametrization == 'exp':
        min_std_param = np.log(min_std)
    elif std_parametrization == 'softplus':
        min_std_param = np.log(np.exp(min_std) - 1)
    else:
        raise NotImplementedError

    self.min_std_param = min_std_param

    self._dist = DiagonalGaussian(self.action_dim)

    self._cached_params = {}

    super(MAMLGaussianMLPPolicy, self).__init__(env_spec)

    dist_info_sym = self.dist_info_sym(self.input_tensor, dict(),
                                       is_training=False)
    mean_var = dist_info_sym["mean"]
    log_std_var = dist_info_sym["log_std"]

    # pre-update policy
    self._init_f_dist = tensor_utils.compile_function(
        inputs=[self.input_tensor],
        outputs=[mean_var, log_std_var],
    )
    self._cur_f_dist = self._init_f_dist
def __init__(self,
             env_spec,
             env,
             pkl_paths=(),
             json_paths=(),
             npz_paths=(),
             trainable_old=True,
             external_selector=False,
             hidden_sizes_selector=(10, 10),
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=NL.tanh,
             hidden_nonlinearity=NL.tanh,
             output_nonlinearity=None,
             min_std=1e-4):
    """
    :param pkl_paths: tuple/list of pkl paths
    :param json_paths: tuple/list of json paths
    :param npz_paths: tuple/list of npz paths
    :param trainable_old: Are the old policies still trainable
    :param external_selector: is the linear combination of the old policies'
     outputs fixed externally
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    """
    # define where the old policies to use are and what to do with them:
    self.trainable_old = trainable_old  # whether to keep training the old policies loaded here
    self.pkl_paths = pkl_paths
    self.json_paths = json_paths
    self.npz_paths = npz_paths
    self.selector_dim = max(len(json_paths), len(pkl_paths))  # pkl could be zero if giving npz

    # if not using a selector NN here, just an externally fixed selector variable:
    self.external_selector = external_selector  # whether to use the selector NN defined here or the pre_fix_selector
    self.pre_fix_selector = np.zeros((self.selector_dim))  # if not empty when using reset(), this selector is used
    self.selector_fix = np.zeros((self.selector_dim))  # holds the selector variable sampled in reset()
    self.shared_selector_var = theano.shared(self.selector_fix)  # for the external selector! update that

    # else, describe the MLP used:
    self.hidden_sizes_selector = hidden_sizes_selector  # size of the selector NN defined here
    self.min_std = min_std
    self._set_std_to_0 = False

    self.action_dim = env_spec.action_space.flat_dim  # not checking that all the old policies have this act_dim

    self.old_hidden_sizes = []
    # assume json is always given
    for json_path in self.json_paths:
        data = json.load(
            open(os.path.join(config.PROJECT_PATH, json_path), 'r'))
        old_json_policy = data['json_args']["policy"]
        self.old_hidden_sizes.append(old_json_policy['hidden_sizes'])

    # retrieve dimensions and check consistency
    if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                env.wrapped_env, GatherEnv):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0
    # print("the dims of the env are (rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
    all_obs_dim = env_spec.observation_space.flat_dim
    assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    if self.external_selector:  # in case we want to fix the selector externally
        l_all_obs_var = L.InputLayer(
            shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,))
        all_obs_var = l_all_obs_var.input_var
        l_selection = ParamLayer(incoming=l_all_obs_var,
                                 num_units=self.selector_dim,
                                 param=self.shared_selector_var,
                                 trainable=False)
        selection_var = L.get_output(l_selection)
    else:
        # create a network with softmax output: it will be the selector!
        selector_network = MLP(
            input_shape=(self.obs_robot_dim + self.obs_maze_dim,),
            output_dim=self.selector_dim,
            hidden_sizes=self.hidden_sizes_selector,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )
        l_all_obs_var = selector_network.input_layer
        all_obs_var = selector_network.input_layer.input_var

        # collect the output to select the behavior of the robot controller
        # (equivalent to selectors)
        l_selection = selector_network.output_layer
        selection_var = L.get_output(l_selection)

    # split all_obs into the robot and the maze obs --> ROBOT goes first!!
    l_obs_robot = CropLayer(l_all_obs_var, start_index=None,
                            end_index=self.obs_robot_dim)
    l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim,
                           end_index=None)

    obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
    obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

    # create the action networks
    self.old_l_means = []  # kept as attributes in case we want to access them from reset
    self.old_l_log_stds = []
    self.old_layers = []
    for i in range(self.selector_dim):
        mean_network = MLP(
            input_layer=l_obs_robot,
            output_dim=self.action_dim,
            hidden_sizes=self.old_hidden_sizes[i],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP{}".format(i),
        )
        self.old_l_means.append(mean_network.output_layer)
        self.old_layers += mean_network.layers

        l_log_std = ParamLayer(
            incoming=mean_network.input_layer,
            num_units=self.action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std{}".format(i),
            trainable=learn_std,
        )
        self.old_l_log_stds.append(l_log_std)
        self.old_layers += [l_log_std]

    if not self.trainable_old:
        for layer in self.old_layers:
            # params of a layer are an OrderedDict: key=the shared var, val=tags
            for param, tags in layer.params.items():
                tags.remove("trainable")

    if self.json_paths and self.npz_paths:
        old_params_dict = {}
        for i, npz_path in enumerate(self.npz_paths):
            params_dict = dict(
                np.load(os.path.join(config.PROJECT_PATH, npz_path)))
            for key in params_dict.keys():
                if key == 'output_log_std.param':
                    old_params_dict['output_log_std{}.param'.format(i)] = \
                        params_dict[key]
                elif 'meanMLP_' == key[:8]:
                    old_params_dict['meanMLP{}_'.format(i) + key[8:]] = \
                        params_dict[key]
                else:
                    old_params_dict['meanMLP{}_'.format(i) + key] = \
                        params_dict[key]
        self.set_old_params(old_params_dict)
    elif self.pkl_paths:
        old_params_dict = {}
        for i, pkl_path in enumerate(self.pkl_paths):
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            params = data['policy'].get_params_internal()
            for param in params:
                if param.name == 'output_log_std.param':
                    old_params_dict['output_log_std{}.param'.format(i)] = \
                        param.get_value()
                elif 'meanMLP_' == param.name[:8]:
                    old_params_dict['meanMLP{}_'.format(i) + param.name[8:]] = \
                        param.get_value()
                else:
                    old_params_dict['meanMLP{}_'.format(i) + param.name] = \
                        param.get_value()
        self.set_old_params(old_params_dict)

    # new layers actually selecting the correct output
    l_mean = SumProdLayer(self.old_l_means + [l_selection])
    l_log_std = SumProdLayer(self.old_l_log_stds + [l_selection])
    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = DiagonalGaussian(self.action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy_multi_hier, self).__init__(env_spec)

    self._f_old_means = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[L.get_output(l_old_mean) for l_old_mean in self.old_l_means])

    self._f_all_inputs = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[L.get_output(l_old_mean) for l_old_mean in self.old_l_means]
        + [selection_var])

    self._f_dist = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[mean_var, log_std_var],
    )
    # to monitor the selector output
    self._f_select = ext.compile_function(
        inputs=[all_obs_var],
        outputs=selection_var,
    )
def __init__(self, wrapped_env, action_space, observation_space):
    Serializable.quick_init(self, locals())
    super(SpecWrapperEnv, self).__init__(wrapped_env)
    self._action_space = action_space
    self._observation_space = observation_space
def __init__(self, num_slices=1):
    Serializable.quick_init(self, locals())
    self.target = None
    self.reg_coeff = None
    self.opt_fun = None
    self._num_slices = num_slices
def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
    self.ctrl_cost_coeff = ctrl_cost_coeff
    super(Walker2DEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self):
    Serializable.quick_init(self, locals())
def __init__(self,
             eta=0.01,
             alpha=0.001,
             max_epochs=1,
             tolerance=1e-5,
             batch_size=32,
             epsilon=1e-8,
             verbose=False,
             num_slices=1,
             use_SGD=False,
             scale=1.0,
             backtrack_ratio=0.5,
             max_backtracks=10,
             cg_iters=10,
             reg_coeff=1e-5,
             subsample_factor=1.,
             hvp_approach=None,
             max_batch=10,
             learning_rate=1e-3,
             **kwargs):
    """
    :param max_epochs:
    :param tolerance:
    :param update_method:
    :param batch_size: None or an integer. If None the whole dataset will be
     used.
    :param cg_iters: The number of CG iterations used to calculate A^-1 g
    :param reg_coeff: A small value so that A -> A + reg*I
    :param subsample_factor: Subsampling factor to reduce samples when using
     conjugate gradient. Since the computation time for the descent direction
     dominates, this can greatly reduce the overall computation time.
    :param kwargs:
    :return:
    """
    Serializable.quick_init(self, locals())
    self._eta = eta
    self._alpha = alpha
    self._opt_fun = None
    self._target = None
    self._max_epochs = max_epochs
    self._tolerance = tolerance
    self._batch_size = batch_size
    self._epsilon = epsilon
    self._verbose = verbose
    self._input_vars = None
    self._num_slices = num_slices
    self._scale = scale
    self._use_SGD = use_SGD
    self._backtrack_ratio = backtrack_ratio
    self._max_backtracks = max_backtracks
    self._max_batch = max_batch
    self._learning_rate = learning_rate
    self._cg_iters = cg_iters
    self._reg_coeff = reg_coeff
    self._subsample_factor = subsample_factor
    if hvp_approach is None:
        hvp_approach = PerlmutterHvp(num_slices)
    self._hvp_approach = hvp_approach
    logger.log('max_batch %d' % self._max_batch)
    logger.log('mini_batch %d' % self._batch_size)
    logger.log('cg_iters %d' % self._cg_iters)
    logger.log('subsample_factor %f' % self._subsample_factor)
def __init__(self, env_params, sim_params, scenario, simulator='traci'):
    """Initialize the environment class.

    Parameters
    ----------
    env_params : flow.core.params.EnvParams
        see flow/core/params.py
    sim_params : flow.core.params.SimParams
        see flow/core/params.py
    scenario : flow.scenarios.Scenario
        see flow/scenarios/base_scenario.py
    simulator : str
        the simulator used, one of {'traci', 'aimsun'}. Defaults to 'traci'

    Raises
    ------
    flow.utils.exceptions.FatalFlowError
        if the render mode is not set to a valid value
    """
    # Invoke Serializable if using rllab
    if serializable_flag:
        Serializable.quick_init(self, locals())

    self.env_params = env_params
    self.scenario = scenario
    self.sim_params = sim_params
    time_stamp = ''.join(str(time.time()).split('.'))
    if os.environ.get("TEST_FLAG", 0):
        # 1.0 works with stress_test_start 10k times
        time.sleep(1.0 * int(time_stamp[-6:]) / 1e6)
    # FIXME: this is sumo-specific
    self.sim_params.port = sumolib.miscutils.getFreeSocketPort()
    # time_counter: number of steps taken since the start of a rollout
    self.time_counter = 0
    # step_counter: number of total steps taken
    self.step_counter = 0
    # initial_state:
    #   Key = Vehicle ID,
    #   Entry = (type_id, route_id, lane_index, lane_pos, speed, pos)
    self.initial_state = {}
    self.state = None
    self.obs_var_labels = []

    # simulation step size
    self.sim_step = sim_params.sim_step

    # the simulator used by this environment
    self.simulator = simulator

    # create the Flow kernel
    self.k = Kernel(simulator=self.simulator, sim_params=sim_params)

    # use the scenario class's network parameters to generate the necessary
    # scenario components within the scenario kernel
    self.k.scenario.generate_network(scenario)

    # initialize the vehicles kernel using the VehicleParams object
    self.k.vehicle.initialize(deepcopy(scenario.vehicles))

    # initialize the simulation using the simulation kernel. This will use
    # the scenario kernel as an input in order to determine what network
    # needs to be simulated.
    kernel_api = self.k.simulation.start_simulation(
        scenario=self.k.scenario, sim_params=sim_params)

    # pass the kernel api to the kernel and its subclasses
    self.k.pass_api(kernel_api)

    # the available_routes variable contains a dictionary of routes vehicles
    # can traverse; to be used when routes need to be chosen dynamically
    self.available_routes = self.k.scenario.rts

    # store the initial vehicle ids
    self.initial_ids = deepcopy(scenario.vehicles.ids)

    # store the initial state of the vehicles kernel (needed for restarting
    # the simulation)
    self.k.vehicle.kernel_api = None
    self.k.vehicle.master_kernel = None
    self.initial_vehicles = deepcopy(self.k.vehicle)
    self.k.vehicle.kernel_api = self.k.kernel_api
    self.k.vehicle.master_kernel = self.k

    self.setup_initial_state()

    # use pyglet to render the simulation
    if self.sim_params.render in ['gray', 'dgray', 'rgb', 'drgb']:
        save_render = self.sim_params.save_render
        sight_radius = self.sim_params.sight_radius
        pxpm = self.sim_params.pxpm
        show_radius = self.sim_params.show_radius

        # get network polygons
        network = []
        # FIXME: add to scenario kernel instead of hack
        for lane_id in self.k.kernel_api.lane.getIDList():
            _lane_poly = self.k.kernel_api.lane.getShape(lane_id)
            lane_poly = [i for pt in _lane_poly for i in pt]
            network.append(lane_poly)

        # instantiate a pyglet renderer
        self.renderer = Renderer(
            network,
            self.sim_params.render,
            save_render,
            sight_radius=sight_radius,
            pxpm=pxpm,
            show_radius=show_radius)

        # render a frame
        self.render(reset=True)
    elif self.sim_params.render in [True, False]:
        # default to sumo-gui (if True) or sumo (if False)
        pass
    else:
        raise FatalFlowError(
            'Mode %s is not supported!' % self.sim_params.render)

    atexit.register(self.terminate)
def __init__(self, name, generator_class, vehicles, net_params,
             initial_config=InitialConfig()):
    """Abstract base class. Initializes a new scenario.

    This class can be instantiated once and reused in multiple experiments.
    Note that this function stores all the relevant parameters and, at the
    end of initialization, calls generate() to produce the configuration
    files.

    Parameters
    ----------
    name : str
        A tag associated with the scenario
    generator_class : Generator type
        Class for generating configuration and net files with placed
        vehicles, e.g. CircleGenerator
    vehicles : Vehicles type
        see flow/core/vehicles.py
    net_params : NetParams type
        see flow/core/params.py
    initial_config : InitialConfig type
        see flow/core/params.py

    Raises
    ------
    ValueError
        If no "length" is provided in net_params
    """
    Serializable.quick_init(self, locals())

    self.name = name
    self.generator_class = generator_class
    self.vehicles = vehicles
    self.net_params = net_params
    self.initial_config = initial_config

    # parameters to be specified under each unique subclass's
    # __init__() function
    self.edgestarts = self.specify_edge_starts()

    # these optional parameters need only be used if "no-internal-links"
    # is set to "false" while calling sumo's netconvert function
    self.internal_edgestarts = self.specify_internal_edge_starts()
    self.intersection_edgestarts = self.specify_intersection_edge_starts()

    # in case the user did not write the intersection edge-starts in
    # internal edge-starts as well (because of redundancy), merge the two
    # together
    self.internal_edgestarts += self.intersection_edgestarts
    seen = set()
    self.internal_edgestarts = \
        [item for item in self.internal_edgestarts
         if item[1] not in seen and not seen.add(item[1])]

    # total_edgestarts and total_edgestarts_dict contain all of the above
    # edges, with the former being ordered by position
    if self.net_params.no_internal_links:
        self.total_edgestarts = self.edgestarts
    else:
        self.total_edgestarts = self.edgestarts + self.internal_edgestarts
    self.total_edgestarts.sort(key=lambda tup: tup[1])

    self.total_edgestarts_dict = dict(self.total_edgestarts)

    # length of the network, or the portion of the network in which cars
    # are meant to be distributed (to be calculated during subclass
    # __init__(), or specified in net_params)
    if not hasattr(self, "length"):
        if "length" in self.net_params.additional_params:
            self.length = self.net_params.additional_params["length"]
        else:
            raise ValueError(
                "The network does not have a specified length.")

    # generate starting positions for vehicles in the network
    if self.initial_config.positions is None:
        self.initial_config.positions, self.initial_config.lanes = \
            self.generate_starting_positions()

    self.cfg = self.generate()
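# --- Subclassing sketch (illustrative, not from the source) ---
# The constructor above calls specify_edge_starts() and its internal/
# intersection variants, so any concrete scenario must implement them. A
# hedged minimal sketch; the edge names and positions below are assumptions.
class MinimalRingScenario(Scenario):
    def specify_edge_starts(self):
        # (edge_id, absolute start position) pairs, ordered along the route
        return [("bottom", 0), ("right", 25), ("top", 50), ("left", 75)]

    def specify_internal_edge_starts(self):
        # only needed when sumo's internal links are enabled
        return []

    def specify_intersection_edge_starts(self):
        return []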
def __getstate__(self):
    d = Serializable.__getstate__(self)
    global load_params
    if load_params:
        d["params"] = self.get_param_values(all_params=True)
    return d
def __init__(self, goal=None, *args, **kwargs):
    self._goal_vel = goal
    super(HalfCheetahEnvRand, self).__init__(*args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
def __init__(
        self,
        base_kwargs,
        env,
        policy,
        initial_exploration_policy,
        qf1,
        qf2,
        vf,
        pool,
        plotter=None,
        lr=3e-3,
        scale_reward=1,
        scale_entropy=1,
        discount=0.99,
        tau=0.01,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        save_full_state=False,
):
    """
    Args:
        base_kwargs (dict): Dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object.
        policy (`rllab.NNPolicy`): A policy function approximator.
        initial_exploration_policy (`Policy`): A policy used for initial
            exploration; it is not trained by the algorithm.
        qf1 (`ValueFunction`): First Q-function approximator.
        qf2 (`ValueFunction`): Second Q-function approximator. Using two
            Q-functions improves performance by reducing overestimation
            bias.
        vf (`ValueFunction`): Soft value function approximator.
        pool (`PoolBase`): Replay buffer to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance used for visualizing
            the Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval (`int`): Frequency at which target network
            updates occur in iterations.
        reparameterize (`bool`): If True, use a gradient estimator for the
            policy derived via the reparameterization trick; otherwise use
            a likelihood-ratio-based estimator.
        save_full_state (`bool`): If True, save the full class in the
            snapshot. See `self.get_snapshot` for more information.
    """
    Serializable.quick_init(self, locals())
    super(SAC, self).__init__(**base_kwargs)

    self._env = env
    self._policy = policy
    self._initial_exploration_policy = initial_exploration_policy
    self._qf1 = qf1
    self._qf2 = qf2
    self._vf = vf
    self._pool = pool
    self._plotter = plotter

    self._policy_lr = lr
    self._qf_lr = lr
    self._vf_lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior

    # The reparameterize setting must match between the algorithm and the
    # policy that actions are sampled from.
    assert reparameterize == self._policy._reparameterize
    self._reparameterize = reparameterize

    self._save_full_state = save_full_state

    self._Da = self._env.action_space.flat_dim
    self._Do = self._env.observation_space.flat_dim

    self._training_ops = list()

    self._init_placeholders()
    self._init_actor_update()
    self._init_critic_update()
    self._init_target_ops()

    # Initialize only the uninitialized variables. This avoids
    # re-initializing pre-trained policy, qf, and vf variables.
    uninit_vars = []
    for var in tf.global_variables():
        try:
            self._sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninit_vars.append(var)
    self._sess.run(tf.variables_initializer(uninit_vars))
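# --- Usage sketch (illustrative, not from the source) ---
# A hedged example of wiring the SAC constructor above. The env, policy,
# value functions, and replay pool are assumed to be built elsewhere with
# the interfaces named in the docstring; all values are arbitrary, and
# train() is an assumed entry point on the base RLAlgorithm.
algorithm = SAC(
    base_kwargs=dict(n_epochs=500, epoch_length=1000),  # assumed kwargs
    env=env,
    policy=policy,
    initial_exploration_policy=exploration_policy,
    qf1=qf1,
    qf2=qf2,
    vf=vf,
    pool=pool,
    lr=3e-4,
    discount=0.99,
    tau=0.01,
    reparameterize=policy._reparameterize,  # must match the policy (asserted above)
)
algorithm.train()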
def __init__(self, *args, **kwargs):
    super(AntEnv, self).__init__(*args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        hidden_sizes,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_nonlinearity=NL.rectify,
        mean_network=None,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        subsample_factor=1.0,
        batchsize=None,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_conv_filters=[],
        std_conv_filter_sizes=[],
        std_conv_strides=[],
        std_conv_pads=[],
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
):
    """
    :param input_shape: usually for images of the form
        (width, height, channel)
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
        network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the
        mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use a trust region constraint.
    :param step_size: KL divergence constraint for each iteration.
    :param learn_std: Whether to learn the standard deviations. Only
        effective if adaptive_std is False. If adaptive_std is True, this
        parameter is ignored, and the weights for the std network are
        always learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean.
    :param std_hidden_sizes: Number of hidden units of each layer of the
        std network. Only used if `std_share_network` is False. It defaults
        to the same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std
        network. Only used if `std_share_network` is False. It defaults to
        the same non-linearity as the mean.
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer("optimizer")
        else:
            optimizer = LbfgsOptimizer("optimizer")

    self._optimizer = optimizer
    self.input_shape = input_shape

    if mean_network is None:
        mean_network = ConvNetwork(
            name="mean_network",
            input_shape=input_shape,
            output_dim=output_dim,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=None,
        )

    l_mean = mean_network.output_layer

    if adaptive_std:
        l_log_std = ConvNetwork(
            name="log_std_network",
            input_shape=input_shape,
            input_var=mean_network.input_layer.input_var,
            output_dim=output_dim,
            conv_filters=std_conv_filters,
            conv_filter_sizes=std_conv_filter_sizes,
            conv_strides=std_conv_strides,
            conv_pads=std_conv_pads,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_nonlinearity,
            output_nonlinearity=None,
        ).output_layer
    else:
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=output_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    LasagnePowered.__init__(self, [l_mean, l_log_std])

    xs_var = mean_network.input_layer.input_var
    ys_var = TT.matrix("ys")
    old_means_var = TT.matrix("old_means")
    old_log_stds_var = TT.matrix("old_log_stds")

    x_mean_var = theano.shared(
        np.zeros((1, np.prod(input_shape)), dtype=theano.config.floatX),
        name="x_mean",
        broadcastable=(True, False),
    )
    x_std_var = theano.shared(
        np.ones((1, np.prod(input_shape)), dtype=theano.config.floatX),
        name="x_std",
        broadcastable=(True, False),
    )
    y_mean_var = theano.shared(
        np.zeros((1, output_dim), dtype=theano.config.floatX),
        name="y_mean",
        broadcastable=(True, False),
    )
    y_std_var = theano.shared(
        np.ones((1, output_dim), dtype=theano.config.floatX),
        name="y_std",
        broadcastable=(True, False),
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var
    normalized_ys_var = (ys_var - y_mean_var) / y_std_var

    normalized_means_var = L.get_output(
        l_mean, {mean_network.input_layer: normalized_xs_var})
    normalized_log_stds_var = L.get_output(
        l_log_std, {mean_network.input_layer: normalized_xs_var})

    means_var = normalized_means_var * y_std_var + y_mean_var
    log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

    normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
    normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

    dist = self._dist = DiagonalGaussian(output_dim)

    normalized_dist_info_vars = dict(
        mean=normalized_means_var, log_std=normalized_log_stds_var)

    mean_kl = TT.mean(
        dist.kl_sym(
            dict(mean=normalized_old_means_var,
                 log_std=normalized_old_log_stds_var),
            normalized_dist_info_vars,
        ))

    loss = -TT.mean(dist.log_likelihood_sym(
        normalized_ys_var, normalized_dist_info_vars))

    self._f_predict = compile_function([xs_var], means_var)
    self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
    self._l_mean = l_mean
    self._l_log_std = l_log_std

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[normalized_means_var, normalized_log_stds_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [
            xs_var, ys_var, old_means_var, old_log_stds_var
        ]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._mean_network = mean_network
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
    self._y_mean_var = y_mean_var
    self._y_std_var = y_std_var
    self._subsample_factor = subsample_factor
    self._batchsize = batchsize
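# --- Usage sketch (illustrative, not from the source) ---
# A hedged example of the conv regressor above on small image inputs. The
# class name GaussianConvRegressor and the fit/predict interface are
# assumptions consistent with rllab-style regressors; shapes and
# hyperparameters are arbitrary.
regressor = GaussianConvRegressor(
    name="conv_regressor",
    input_shape=(32, 32, 3),       # (width, height, channel), per the docstring
    output_dim=1,
    hidden_sizes=(64,),
    conv_filters=(16, 16),
    conv_filter_sizes=(4, 4),
    conv_strides=(2, 2),
    conv_pads=('valid', 'valid'),
)
regressor.fit(xs, ys)              # xs: (N, 32*32*3) flat images, ys: (N, 1)
y_pred = regressor.predict(xs)     # uses the compiled self._f_predict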
def __init__(self, name, input_shape, extra_input_shape, output_dim,
             hidden_sizes, conv_filters, conv_filter_sizes, conv_strides,
             conv_pads, extra_hidden_sizes=None,
             hidden_W_init=L.XavierUniformInitializer(),
             hidden_b_init=tf.zeros_initializer(),
             output_W_init=L.XavierUniformInitializer(),
             output_b_init=tf.zeros_initializer(),
             hidden_nonlinearity=tf.nn.relu,
             output_nonlinearity=None,
             input_var=None,
             input_layer=None):
    Serializable.quick_init(self, locals())

    if extra_hidden_sizes is None:
        extra_hidden_sizes = []

    with tf.variable_scope(name):
        input_flat_dim = np.prod(input_shape)
        extra_input_flat_dim = np.prod(extra_input_shape)
        total_input_flat_dim = input_flat_dim + extra_input_flat_dim

        if input_layer is None:
            l_in = L.InputLayer(
                shape=(None, total_input_flat_dim),
                input_var=input_var,
                name="input")
        else:
            l_in = input_layer

        l_conv_in = L.reshape(
            L.SliceLayer(l_in, indices=slice(input_flat_dim),
                         name="conv_slice"),
            ([0], ) + input_shape,
            name="conv_reshaped")
        l_extra_in = L.reshape(
            L.SliceLayer(l_in, indices=slice(input_flat_dim, None),
                         name="extra_slice"),
            ([0], ) + extra_input_shape,
            name="extra_reshaped")

        l_conv_hid = l_conv_in
        for idx, conv_filter, filter_size, stride, pad in zip(
                range(len(conv_filters)),
                conv_filters,
                conv_filter_sizes,
                conv_strides,
                conv_pads,
        ):
            l_conv_hid = L.Conv2DLayer(
                l_conv_hid,
                num_filters=conv_filter,
                filter_size=filter_size,
                stride=(stride, stride),
                pad=pad,
                nonlinearity=hidden_nonlinearity,
                name="conv_hidden_%d" % idx,
            )

        l_extra_hid = l_extra_in
        for idx, hidden_size in enumerate(extra_hidden_sizes):
            l_extra_hid = L.DenseLayer(
                l_extra_hid,
                num_units=hidden_size,
                nonlinearity=hidden_nonlinearity,
                name="extra_hidden_%d" % idx,
                W=hidden_W_init,
                b=hidden_b_init,
            )

        l_joint_hid = L.concat(
            [L.flatten(l_conv_hid, name="conv_hidden_flat"), l_extra_hid],
            name="joint_hidden")

        for idx, hidden_size in enumerate(hidden_sizes):
            l_joint_hid = L.DenseLayer(
                l_joint_hid,
                num_units=hidden_size,
                nonlinearity=hidden_nonlinearity,
                name="joint_hidden_%d" % idx,
                W=hidden_W_init,
                b=hidden_b_init,
            )
        l_out = L.DenseLayer(
            l_joint_hid,
            num_units=output_dim,
            nonlinearity=output_nonlinearity,
            name="output",
            W=output_W_init,
            b=output_b_init,
        )
        self._l_in = l_in
        self._l_out = l_out

    LayersPowered.__init__(self, [l_out], input_layers=[l_in])
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        is_protagonist=True,
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden
        layers
    :param learn_std: whether the std is trainable
    :param init_std: initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
        for std
    :param min_std: minimum threshold for the std, to avoid numerical
        issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :return:
    """
    Serializable.quick_init(self, locals())
    if is_protagonist:
        cur_action_space = env_spec.pro_action_space
    else:
        cur_action_space = env_spec.adv_action_space
    assert isinstance(cur_action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = cur_action_space.flat_dim

    # create network
    if mean_network is None:
        mean_network = MLP(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
    self._mean_network = mean_network

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_layer.input_var

    if std_network is not None:
        l_log_std = std_network.output_layer
    else:
        if adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim,),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

    self.min_std = min_std

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        log_std_var = TT.maximum(log_std_var, np.log(min_std))

    self._mean_var, self._log_std_var = mean_var, log_std_var

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = dist_cls(action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy, self).__init__(env_spec)

    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )
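# --- Usage sketch (illustrative, not from the source) ---
# A hedged example of the protagonist/adversary switch above. It assumes an
# adversarial env spec exposing pro_action_space and adv_action_space (both
# Box), as read by the constructor, and the standard rllab policy API.
pro_policy = GaussianMLPPolicy(env_spec=env.spec, is_protagonist=True)
adv_policy = GaussianMLPPolicy(env_spec=env.spec, is_protagonist=False)
action, agent_info = pro_policy.get_action(obs)  # assumed rllab policy method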
def __init__(self, n_apples=8, n_bombs=8, activity_range=6.,
             robot_object_spacing=2., catch_range=1., n_bins=10,
             sensor_range=6., sensor_span=math.pi, coef_inner_rew=0.,
             dying_cost=-10, *args, **kwargs):
    Serializable.quick_init(self, locals())
    self.n_apples = n_apples
    self.n_bombs = n_bombs
    self.activity_range = activity_range
    self.robot_object_spacing = robot_object_spacing
    self.catch_range = catch_range
    self.n_bins = n_bins
    self.sensor_range = sensor_range
    self.sensor_span = sensor_span
    self.coef_inner_rew = coef_inner_rew
    self.dying_cost = dying_cost
    self.objects = []
    self.viewer = None

    # for openai baselines
    self.reward_range = (-float('inf'), float('inf'))
    self.metadata = None

    # super(GatherEnv, self).__init__(*args, **kwargs)
    model_cls = self.__class__.MODEL_CLASS
    if model_cls is None:
        raise NotImplementedError("MODEL_CLASS unspecified!")
    xml_path = osp.join(MODEL_DIR, model_cls.FILE)
    tree = ET.parse(xml_path)
    worldbody = tree.find(".//worldbody")
    attrs = dict(
        type="box", conaffinity="1", rgba="0.8 0.9 0.8 1", condim="3")
    walldist = self.activity_range + 1
    ET.SubElement(
        worldbody, "geom",
        dict(attrs, name="wall1", pos="0 -%d 0" % walldist,
             size="%d.5 0.5 1" % walldist))
    ET.SubElement(
        worldbody, "geom",
        dict(attrs, name="wall2", pos="0 %d 0" % walldist,
             size="%d.5 0.5 1" % walldist))
    ET.SubElement(
        worldbody, "geom",
        dict(attrs, name="wall3", pos="-%d 0 0" % walldist,
             size="0.5 %d.5 1" % walldist))
    ET.SubElement(
        worldbody, "geom",
        dict(attrs, name="wall4", pos="%d 0 0" % walldist,
             size="0.5 %d.5 1" % walldist))

    # _, file_path = tempfile.mkstemp(text=True)
    # todo: note that this is different from the snn4hrl default
    if 'param_name' in kwargs:  # added because of ec2 empty xml issue
        file_path = osp.join(
            config.PROJECT_PATH,
            "sandbox/snn4hrl/envs/mujoco/gather/mujoco_models/"
            + model_cls.FILE.split(".")[0] + "_"
            + kwargs['param_name'] + "_gather.xml")
    else:
        file_path = osp.join(
            config.PROJECT_PATH,
            "sandbox/snn4hrl/envs/mujoco/gather/mujoco_models/"
            + model_cls.FILE.split(".")[0] + "_gather.xml")
    if not osp.exists(file_path):  # create the file if it is not there
        with open(file_path, 'w+'):
            pass
    tree.write(file_path)

    # pylint: disable=not-callable
    # giving problems because of this weird tempfile
    inner_env = model_cls(*args, file_path=file_path, **kwargs)
    # pylint: enable=not-callable

    # to access the inner env, do self.wrapped_env
    ProxyEnv.__init__(self, inner_env)

    # optimization: cache the observation spaces
    ub = BIG * np.ones(self.get_current_obs().shape)
    self.obs_space = spaces.Box(ub * -1, ub)
    ub = BIG * np.ones(self.get_current_robot_obs().shape)
    self.robot_obs_space = spaces.Box(ub * -1, ub)
    ub = BIG * np.ones(np.concatenate(self.get_readings()).shape)
    self.maze_obs_space = spaces.Box(ub * -1, ub)
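# --- Subclassing sketch (illustrative, not from the source) ---
# The constructor above reads self.__class__.MODEL_CLASS, so concrete gather
# envs are expected to bind it to an inner MuJoCo env class; AntEnv here is
# an assumption based on how snn4hrl-style gather environments are defined.
class AntGatherEnv(GatherEnv):
    MODEL_CLASS = AntEnv  # any class with a FILE attribute and matching ctor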
def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
    self.ctrl_cost_coeff = ctrl_cost_coeff
    self._goal_vel = None
    super(SwimmerRandGoalOracleEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())