def __init__(self, desc='two-state', map_id=None):
    self._map_id = map_id
    Serializable.quick_init(self, locals())
    if isinstance(desc, str):
        desc = MAPS[desc]
    self.desc_choices = desc
    self.reset()
def __init__(
        self,
        update_method=lasagne.updates.adam,
        learning_rate=1e-3,
        max_epochs=1000,
        tolerance=1e-6,
        batch_size=32,
        callback=None,
        verbose=False,
        **kwargs):
    """
    :param update_method: Lasagne update rule applied at each gradient step.
    :param learning_rate: Step size passed to the update method.
    :param max_epochs: Maximum number of passes over the dataset.
    :param tolerance: Stop early once the per-epoch loss improvement falls
     below this value.
    :param batch_size: None or an integer. If None the whole dataset will
     be used.
    :param callback: Optional callable invoked after each epoch.
    """
    Serializable.quick_init(self, locals())
    self._opt_fun = None
    self._target = None
    self._callback = callback
    update_method = partial(update_method, learning_rate=learning_rate)
    self._update_method = update_method
    self._max_epochs = max_epochs
    self._tolerance = tolerance
    self._batch_size = batch_size
    self._verbose = verbose
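# Illustrative sketch (an assumption, not this class's code) of the loop that
# `max_epochs`, `tolerance`, and `batch_size` above control. `f_train`,
# `f_loss`, and `dataset` are hypothetical stand-ins for the compiled
# training/loss functions and the input data.
def _sketch_first_order_optimize(f_train, f_loss, dataset,
                                 max_epochs=1000, tolerance=1e-6,
                                 batch_size=32):
    last_loss = f_loss(dataset)
    for _ in range(max_epochs):
        if batch_size is None:
            f_train(dataset)  # batch_size=None: use the whole dataset
        else:
            for start in range(0, len(dataset), batch_size):
                f_train(dataset[start:start + batch_size])
        new_loss = f_loss(dataset)
        if abs(last_loss - new_loss) < tolerance:
            break  # improvement below tolerance: stop early
        last_loss = new_loss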
def __init__(
        self,
        epsilon=0.5,
        L2_reg_dual=0.,  # 1e-5,
        L2_reg_loss=0.,
        max_opt_itr=50,
        optimizer=scipy.optimize.fmin_l_bfgs_b,
        **kwargs):
    """
    :param epsilon: Max KL divergence between new policy and old policy.
    :param L2_reg_dual: Dual regularization
    :param L2_reg_loss: Loss regularization
    :param max_opt_itr: Maximum number of batch optimization iterations.
    :param optimizer: Module path to the optimizer. It must support the
     same interface as scipy.optimize.fmin_l_bfgs_b.
    """
    Serializable.quick_init(self, locals())
    super(REPS, self).__init__(**kwargs)
    self.epsilon = epsilon
    self.L2_reg_dual = L2_reg_dual
    self.L2_reg_loss = L2_reg_loss
    self.max_opt_itr = max_opt_itr
    self.optimizer = optimizer
    self.opt_info = None
def __init__(self, env_spec, hidden_sizes=(32, 32),
             hidden_nonlinearity=NL.tanh, prob_network=None):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy; other
     network params are ignored
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    if prob_network is None:
        prob_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim,),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
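# Illustrative usage (hypothetical `env` and `obs_batch`): instantiate the
# policy for a discrete-action environment and query action probabilities
# for a batch of flattened observations.
#
#   policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
#   probs = policy._f_prob(obs_batch)  # shape (batch, env.spec.action_space.n)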
def __init__(self, env, ma_mode):
    Serializable.quick_init(self, locals())
    self.env = env
    if hasattr(env, 'id'):
        self.env_id = env.id
    else:
        self.env_id = 'MA-Wrapper-v0'

    if ma_mode == 'centralized':
        obsfeat_space = convert_gym_space(env.agents[0].observation_space,
                                          n_agents=len(env.agents))
        action_space = convert_gym_space(env.agents[0].action_space,
                                         n_agents=len(env.agents))
    elif ma_mode in ['decentralized', 'concurrent']:
        obsfeat_space = convert_gym_space(env.agents[0].observation_space,
                                          n_agents=1)
        action_space = convert_gym_space(env.agents[0].action_space,
                                         n_agents=1)
    else:
        raise NotImplementedError

    self._observation_space = obsfeat_space
    self._action_space = action_space
    if hasattr(env, 'timestep_limit'):
        self._horizon = env.timestep_limit
    else:
        self._horizon = 250
def __init__(
        self,
        cg_iters=10,
        reg_coeff=1e-5,
        subsample_factor=0.1,
        backtrack_ratio=0.8,
        max_backtracks=15,
        debug_nan=False):
    """
    :param cg_iters: The number of CG iterations used to calculate A^-1 g
    :param reg_coeff: A small value so that A -> A + reg*I
    :param subsample_factor: Subsampling factor to reduce samples when
     using conjugate gradient. Since the computation time for the descent
     direction dominates, this can greatly reduce the overall computation
     time.
    :param debug_nan: if set to True, NanGuard will be added to the
     compilation, and ipdb will be invoked when a nan is detected
    """
    Serializable.quick_init(self, locals())
    self._cg_iters = cg_iters
    self._reg_coeff = reg_coeff
    self._subsample_factor = subsample_factor
    self._backtrack_ratio = backtrack_ratio
    self._max_backtracks = max_backtracks
    self._opt_fun = None
    self._target = None
    self._max_constraint_val = None
    self._constraint_name = None
    self._debug_nan = debug_nan
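# Illustrative sketch of how `cg_iters` and `reg_coeff` are typically used to
# approximate A^-1 g with conjugate gradient, given a Hessian-vector product
# callable `f_Ax`. A hypothetical stand-alone helper, not this optimizer's
# actual method.
import numpy as np

def _conjugate_gradient(f_Ax, g, cg_iters=10, reg_coeff=1e-5, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()   # residual of (A + reg*I) x = g, with x = 0
    p = r.copy()
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p) + reg_coeff * p  # A -> A + reg*I for conditioning
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x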
def __init__(self, regressors):
    """
    :param regressors: List of individual regressors
    """
    Serializable.quick_init(self, locals())
    self.regressors = regressors
    self.output_dims = [x.output_dim for x in regressors]
def __init__(self, mdp_cls, mdp_args):
    Serializable.quick_init(self, locals())
    self.mdp_cls = mdp_cls
    self.mdp_args = dict(mdp_args)
    self.mdp_args["template_args"] = dict(noise=True)
    mdp = self.gen_mdp()
    super(IdentificationEnv, self).__init__(mdp)
def __init__(
        self,
        env,
        policy,
        n_itr=500,
        max_path_length=500,
        discount=0.99,
        sigma0=1.,
        batch_size=None,
        plot=False,
        **kwargs):
    """
    :param n_itr: Number of iterations.
    :param max_path_length: Maximum length of a single rollout.
    :param batch_size: Number of trajectory samples drawn from the
     parameter distribution; when this is set, n_samples is ignored.
    :param discount: Discount.
    :param plot: Plot evaluation run after each iteration.
    :param sigma0: Initial std for the parameter distribution.
    """
    Serializable.quick_init(self, locals())
    self.env = env
    self.policy = policy
    self.plot = plot
    self.sigma0 = sigma0
    self.discount = discount
    self.max_path_length = max_path_length
    self.n_itr = n_itr
    self.batch_size = batch_size
def __init__(self, obj, method_name, args, kwargs):
    self._serializable_initialized = False
    Serializable.quick_init(self, locals())
    self.obj = obj
    self.method_name = method_name
    self.args = args
    self.kwargs = kwargs
def __init__(self, env_name, record_video=True, video_schedule=None,
             log_dir=None, record_log=True, force_reset=False):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since "
                       "snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)
    self.env = env
    self.env_id = env.spec.id

    monitor_manager.logger.setLevel(logging.WARNING)

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir,
                                        video_callable=video_schedule,
                                        force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    self._action_space = convert_gym_space(env.action_space)
    self._horizon = env.spec.timestep_limit
    self._log_dir = log_dir
    self._force_reset = force_reset
def __init__(self, name, max_opt_itr=20, callback=None):
    Serializable.quick_init(self, locals())
    self._name = name
    self._max_opt_itr = max_opt_itr
    self._opt_fun = None
    self._target = None
    self._callback = callback
def __init__(
        self,
        ctrl_cost_coeff=1e-2,
        *args, **kwargs):
    self.ctrl_cost_coeff = ctrl_cost_coeff
    super(SwimmerEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(
        self,
        name,
        max_opt_itr=20,
        initial_penalty=1.0,
        min_penalty=1e-2,
        max_penalty=1e6,
        increase_penalty_factor=2,
        decrease_penalty_factor=0.5,
        max_penalty_itr=10,
        adapt_penalty=True):
    Serializable.quick_init(self, locals())
    self._name = name
    self._max_opt_itr = max_opt_itr
    self._penalty = initial_penalty
    self._initial_penalty = initial_penalty
    self._min_penalty = min_penalty
    self._max_penalty = max_penalty
    self._increase_penalty_factor = increase_penalty_factor
    self._decrease_penalty_factor = decrease_penalty_factor
    self._max_penalty_itr = max_penalty_itr
    self._adapt_penalty = adapt_penalty
    self._opt_fun = None
    self._target = None
    self._max_constraint_val = None
    self._constraint_name = None
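# Illustrative sketch (an assumption about how these knobs interact, not this
# class's method): optimize under a penalty, then raise the penalty while the
# constraint is violated and lower it while it is satisfied.
# `optimize_with_penalty` and `constraint_satisfied` are hypothetical.
def _sketch_adapt_penalty(optimize_with_penalty, constraint_satisfied,
                          penalty=1.0, min_penalty=1e-2, max_penalty=1e6,
                          increase_factor=2., decrease_factor=0.5,
                          max_penalty_itr=10):
    params = None
    for _ in range(max_penalty_itr):
        params = optimize_with_penalty(penalty)
        if constraint_satisfied(params):
            # constraint met: try a smaller penalty for a less conservative step
            penalty = max(penalty * decrease_factor, min_penalty)
        else:
            # constraint violated: penalize it harder and re-optimize
            penalty = min(penalty * increase_factor, max_penalty)
    return params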
def __init__(self, env, obs_noise=1e-1):
    super(NoisyObservationEnv, self).__init__(env)
    Serializable.quick_init(self, locals())
    self.obs_noise = obs_noise
def __init__(self, goal_reward=10, actuation_cost_coeff=30,
             distance_cost_coeff=1, init_sigma=0.1):
    super().__init__()
    Serializable.quick_init(self, locals())

    self.dynamics = PointDynamics(dim=2, sigma=0)
    self.init_mu = np.zeros(2, dtype=np.float32)
    self.init_sigma = init_sigma
    self.goal_positions = np.array(
        [
            [5, 0],
            [-5, 0],
            [0, 5],
            [0, -5]
        ],
        dtype=np.float32
    )
    self.goal_threshold = 1.
    self.goal_reward = goal_reward
    self.action_cost_coeff = actuation_cost_coeff
    self.distance_cost_coeff = distance_cost_coeff
    self.xlim = (-7, 7)
    self.ylim = (-7, 7)
    self.vel_bound = 1.
    self.reset()
    self.observation = None

    self._ax = None
    self._env_lines = []
    self.fixed_plots = None
    self.dynamic_plots = []
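# Illustrative sketch of the cost structure these coefficients suggest (an
# assumption; the actual computation lives in this env's `step`): quadratic
# actuation cost plus distance to the nearest goal, with a bonus inside
# `goal_threshold`.
import numpy as np

def _sketch_multigoal_reward(pos, action, goal_positions,
                             actuation_cost_coeff=30, distance_cost_coeff=1,
                             goal_reward=10, goal_threshold=1.):
    dists = np.linalg.norm(goal_positions - pos, axis=1)
    reward = -actuation_cost_coeff * np.sum(action ** 2)
    reward -= distance_cost_coeff * np.min(dists)
    if np.min(dists) < goal_threshold:
        reward += goal_reward
    return reward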
def __init__(self, observation_space, action_space):
    Serializable.quick_init(self, locals())
    self._observation_space = observation_space
    self._action_space = action_space
def __init__(self, env_spec, obs_pl, action, scope_name=None):
    Serializable.quick_init(self, locals())
    self._obs_pl = obs_pl
    self._action = action
    self._scope_name = (tf.get_variable_scope().name
                        if not scope_name else scope_name)
    super(NNPolicy, self).__init__(env_spec)
def __init__(
        self,
        ctrl_cost_coeff=1e-2,
        *args, **kwargs):
    self.ctrl_cost_coeff = ctrl_cost_coeff
    self._goal_vel = None
    super(SwimmerRandGoalEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self, env_spec, max_sigma=1.0, min_sigma=0.1,
             decay_period=1000000):
    assert isinstance(env_spec.action_space, Box)
    assert len(env_spec.action_space.shape) == 1
    Serializable.quick_init(self, locals())
    self._max_sigma = max_sigma
    self._min_sigma = min_sigma
    self._decay_period = decay_period
    self._action_space = env_spec.action_space
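# Illustrative sketch (an assumption; the decay itself is applied when
# actions are drawn): linear interpolation from `max_sigma` down to
# `min_sigma` over `decay_period` steps.
def _sketch_decayed_sigma(t, max_sigma=1.0, min_sigma=0.1,
                          decay_period=1000000):
    frac = min(float(t) / decay_period, 1.0)
    return max_sigma - (max_sigma - min_sigma) * frac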
def __init__(self, *inputs, name, hidden_layer_sizes):
    Parameterized.__init__(self)
    Serializable.quick_init(self, locals())

    self._name = name
    self._inputs = inputs
    self._layer_sizes = list(hidden_layer_sizes) + [1]

    self._output = self._output_for(*self._inputs)
def __init__(self, env, action_delay=3):
    assert action_delay > 0, "Should not use this env transformer"
    super(DelayedActionEnv, self).__init__(env)
    Serializable.quick_init(self, locals())
    self.action_delay = action_delay
    self._queued_actions = None
def __init__(self, max_opt_itr=20, batch_size=32, cg_batch_size=100,
             callback=None):
    Serializable.quick_init(self, locals())
    self._max_opt_itr = max_opt_itr
    self._opt_fun = None
    self._target = None
    self._batch_size = batch_size
    self._cg_batch_size = cg_batch_size
    self._hf_optimizer = None
    self._callback = callback
def __init__(
        self,
        alive_coeff=1,
        ctrl_cost_coeff=0.01,
        *args, **kwargs):
    self.alive_coeff = alive_coeff
    self.ctrl_cost_coeff = ctrl_cost_coeff
    super(HopperEnv, self).__init__(*args, **kwargs)
    Serializable.quick_init(self, locals())
def __init__(self, desc_str='4x4', max_traj_length=10, goal_reward=10.0):
    Serializable.quick_init(self, locals())
    self.desc_str = desc_str  # Map will be loaded in `self.reset`
    self.max_traj_length = max_traj_length
    # list(...) is required on Python 3, where map returns an iterator
    self.n_row, self.n_col = np.array(
        list(map(list, self._fetch_map()))).shape
    self.state = None
    self.goal_reward = goal_reward
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.relu,
        action_merge_layer=-2,
        output_nonlinearity=None,
        bn=False):
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim),
                         name="obs")
    l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim),
                            name="actions")

    n_layers = len(hidden_sizes) + 1

    if n_layers > 1:
        action_merge_layer = \
            (action_merge_layer % n_layers + n_layers) % n_layers
    else:
        action_merge_layer = 1

    l_hidden = l_obs

    for idx, size in enumerate(hidden_sizes):
        if bn:
            l_hidden = batch_norm(l_hidden)

        if idx == action_merge_layer:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_hidden = L.DenseLayer(
            l_hidden,
            num_units=size,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % (idx + 1)
        )

    if action_merge_layer == n_layers:
        l_hidden = L.ConcatLayer([l_hidden, l_action])

    l_output = L.DenseLayer(
        l_hidden,
        num_units=1,
        nonlinearity=output_nonlinearity,
        name="output"
    )

    output_var = L.get_output(l_output, deterministic=True)

    self._f_qval = tensor_utils.compile_function(
        [l_obs.input_var, l_action.input_var], output_var)
    self._output_layer = l_output
    self._obs_layer = l_obs
    self._action_layer = l_action
    self._output_nonlinearity = output_nonlinearity

    LayersPowered.__init__(self, [l_output])
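# Worked example of the merge-index normalization above: with
# hidden_sizes=(32, 32) we get n_layers = 3, so action_merge_layer=-2 maps to
# (-2 % 3 + 3) % 3 == 1, i.e. the actions are concatenated in just before the
# second hidden layer. In Python, -2 % 3 is already 1; the extra wrap simply
# makes the intent explicit.
assert (-2 % 3 + 3) % 3 == 1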
def __init__(self, env_spec, mu=0, theta=0.15, sigma=0.3, **kwargs):
    assert isinstance(env_spec.action_space, Box)
    assert len(env_spec.action_space.shape) == 1
    Serializable.quick_init(self, locals())
    self.mu = mu
    self.theta = theta
    self.sigma = sigma
    self.action_space = env_spec.action_space
    self.state = np.ones(self.action_space.flat_dim) * self.mu
    self.reset()
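# Illustrative sketch of the Ornstein-Uhlenbeck update these parameters
# define (an assumption; the class applies it when evolving `self.state`):
# the state relaxes toward `mu` at rate `theta`, perturbed by Gaussian
# noise of scale `sigma`.
import numpy as np

def _sketch_ou_step(state, mu=0., theta=0.15, sigma=0.3):
    return state + theta * (mu - state) + sigma * np.random.randn(*state.shape)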
def __init__(self, desc='4x4'):
    Serializable.quick_init(self, locals())
    if isinstance(desc, str):  # was `basestring` in the Python 2 original
        desc = MAPS[desc]
    # list(...) keeps this correct on Python 3, where map is lazy
    self.desc = desc = np.array(list(map(list, desc)))
    self.n_row, self.n_col = desc.shape
    (start_x,), (start_y,) = np.nonzero(desc == 'S')
    self.start_state = start_x * self.n_col + start_y
    self.state = None
    self.domain_fig = None
def __init__(
        self,
        optimizer=None,
        optimizer_args=None,
        **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict()
        optimizer = ConjugateGradientOptimizer(**optimizer_args)
    super(TRPO, self).__init__(optimizer=optimizer, **kwargs)
def __init__(self, observation_space, action_space):
    """
    :type observation_space: Space
    :type action_space: Space
    """
    Serializable.quick_init(self, locals())
    self._observation_space = observation_space
    self._action_space = action_space
def __init__(
        self,
        base_kwargs,
        env,
        policy,
        initial_exploration_policy,
        qf1,
        qf2,
        vf,
        pool,
        plotter=None,
        lr=3e-3,
        scale_reward=1,
        discount=0.99,
        tau=0.01,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        save_full_state=False,
):
    """
    Args:
        base_kwargs (dict): dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object.
        policy (`rllab.NNPolicy`): A policy function approximator.
        initial_exploration_policy (`Policy`): A policy that we use for
            initial exploration which is not trained by the algorithm.
        qf1 (`ValueFunction`): First Q-function approximator.
        qf2 (`ValueFunction`): Second Q-function approximator. Usage of two
            Q-functions improves performance by reducing overestimation bias.
        vf (`ValueFunction`): Soft value function approximator.
        pool (`PoolBase`): Replay buffer to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        scale_reward (`float`): Scaling factor for raw reward.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval (`int`): Frequency at which target network
            updates occur in iterations.
        reparameterize (`bool`): If True, we use a gradient estimator for
            the policy derived using the reparameterization trick. We use
            a likelihood ratio based estimator otherwise.
        save_full_state (`bool`): If True, save the full class in the
            snapshot. See `self.get_snapshot` for more information.
    """
    Serializable.quick_init(self, locals())
    super(SAC, self).__init__(**base_kwargs)

    self._env = env
    self._policy = policy
    self._initial_exploration_policy = initial_exploration_policy
    self._qf1 = qf1
    self._qf2 = qf2
    self._vf = vf
    self._pool = pool
    self._plotter = plotter

    self._policy_lr = lr
    self._qf_lr = lr
    self._vf_lr = lr
    self._scale_reward = scale_reward
    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior

    # Reparameterize parameter must match between the algorithm and the
    # policy actions are sampled from.
    assert reparameterize == self._policy._reparameterize
    self._reparameterize = reparameterize

    self._save_full_state = save_full_state

    self._Da = self._env.action_space.flat_dim
    self._Do = self._env.observation_space.flat_dim

    self._training_ops = list()

    self._init_placeholders()
    self._init_actor_update()
    self._init_critic_update()
    self._init_target_ops()

    # Initialize all uninitialized variables. This prevents initializing
    # pre-trained policy and qf and vf variables.
    uninit_vars = []
    for var in tf.global_variables():
        try:
            self._sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninit_vars.append(var)
    self._sess.run(tf.variables_initializer(uninit_vars))
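# Illustrative sketch (an assumption about the shape of `_init_target_ops`
# above, with hypothetical `source_params`/`target_params` variable lists):
# Polyak averaging of value-function weights at rate `tau`.
def _sketch_target_ops(source_params, target_params, tau=0.01):
    return [
        tf.assign(target, (1 - tau) * target + tau * source)
        for target, source in zip(target_params, source_params)
    ]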
def __init__(self, max_opt_itr=20, callback=None):
    Serializable.quick_init(self, locals())
    self._max_opt_itr = max_opt_itr
    self._opt_fun = None
    self._target = None
    self._callback = callback
def __init__(
        self,
        env_spec,
        policy,
        recurrent=False,
        predict_all=True,
        obs_regressed='all',
        act_regressed='all',
        use_only_sign=False,
        noisify_traj_coef=0,
        optimizer=None,  # this defaults to LBFGS
        regressor_args=None,  # all remaining args go straight to the regressor: hidden_sizes, TR, step_size, ...
):
    """
    :param predict_all: this is only for the recurrent case, to use all
     hidden states as predictions
    :param obs_regressed: list of indices of the obs variables used to fit
     the regressor. Defaults to the string 'all'
    :param act_regressed: list of indices of the act variables used to fit
     the regressor. Defaults to the string 'all'
    :param regressor_args: keyword arguments forwarded to the regressor
    """
    self.env_spec = env_spec
    self.policy = policy
    self.latent_dim = policy.latent_dim
    self.recurrent = recurrent
    self.predict_all = predict_all
    self.use_only_sign = use_only_sign
    self.noisify_traj_coef = noisify_traj_coef
    self.regressor_args = regressor_args

    # decide what obs variables will be regressed upon
    if obs_regressed == 'all':
        self.obs_regressed = list(
            range(env_spec.observation_space.flat_dim))
    else:
        self.obs_regressed = obs_regressed

    # decide what action variables will be regressed upon
    if act_regressed == 'all':
        self.act_regressed = list(range(env_spec.action_space.flat_dim))
    else:
        self.act_regressed = act_regressed

    # shape the input dimension of the NN for the above decisions.
    self.obs_act_dim = len(self.obs_regressed) + len(self.act_regressed)

    Serializable.quick_init(self, locals())  # ??

    if regressor_args is None:
        regressor_args = dict()

    if optimizer == 'first_order':
        self.optimizer = FirstOrderOptimizer(
            max_epochs=10,  # both of these are to match Rocky's
            batch_size=128,
        )
    elif optimizer is None:
        self.optimizer = None
    else:
        raise NotImplementedError

    if policy.latent_name == 'bernoulli':
        if self.recurrent:
            self._regressor = BernoulliRecurrentRegressor(
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                predict_all=self.predict_all,
                **regressor_args)
        else:
            self._regressor = BernoulliMLPRegressor(
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                **regressor_args)
    elif policy.latent_name == 'categorical':
        if self.recurrent:
            self._regressor = CategoricalRecurrentRegressor(  # not implemented
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                # predict_all=self.predict_all,
                **regressor_args)
        else:
            self._regressor = CategoricalMLPRegressor(
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                **regressor_args)
    elif policy.latent_name == 'normal':
        self._regressor = GaussianMLPRegressor(
            input_shape=(self.obs_act_dim,),
            output_dim=policy.latent_dim,
            optimizer=self.optimizer,
            **regressor_args)
    else:
        raise NotImplementedError
def __init__(
        self,
        base_kwargs,
        env,
        policy,
        qf,
        vf,
        pool,
        plotter=None,
        lr=3e-3,
        scale_reward=1,
        discount=0.99,
        tau=0.01,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        # my
        entropy_coeff=1.,
        dynamic_coeff=False,
        clip_norm=None,
        resolution=25,
        test_N=1000,  # the number of action samples to estimate Q variance
):
    """
    Args:
        base_kwargs (dict): dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object.
        policy (`rllab.NNPolicy`): A policy function approximator.
        qf (`ValueFunction`): Q-function approximator.
        vf (`ValueFunction`): Soft value function approximator.
        pool (`PoolBase`): Replay buffer to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        scale_reward (`float`): Scaling factor for raw reward.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        save_full_state (`bool`): If True, save the full class in the
            snapshot. See `self.get_snapshot` for more information.
    """
    Serializable.quick_init(self, locals())
    super(SAC, self).__init__(**base_kwargs)

    self._env = env
    self._policy = policy
    self._qf = qf
    self._vf = vf
    self._pool = pool
    self._plotter = plotter

    self._policy_lr = lr
    self._qf_lr = lr
    self._vf_lr = lr
    self._scale_reward = scale_reward
    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior
    self._save_full_state = save_full_state

    self._Da = self._env.spec.action_space.flat_dim
    self._Do = self._env.spec.observation_space.flat_dim

    self._training_ops = list()

    # my
    self._loss_ops = []
    self._ec = tf.Variable(entropy_coeff, name='entropy_coeff')
    self.dynamic_ec = dynamic_coeff
    self.clip_norm = clip_norm

    self._init_placeholders()
    self._init_actor_update()
    self._init_critic_update()
    self._init_target_ops()

    # my
    self.resolution = resolution
    self.test_N = test_N
    if self._env.observation_space.flat_dim <= 2:
        self._init_state_importance()

    # Initialize all uninitialized variables. This prevents initializing
    # pre-trained policy and qf and vf variables.
    uninit_vars = []
    for var in tf.global_variables():
        try:
            self._sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninit_vars.append(var)
    self._sess.run(tf.variables_initializer(uninit_vars))

    # my
    self._saver = tf.train.Saver()
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        output_gain=1,
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
     for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    # create network
    if mean_network is None:
        mean_network = MLP(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            output_W_init=LI.GlorotUniform(gain=output_gain)
        )
    self._mean_network = mean_network

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_layer.input_var

    if std_network is not None:
        l_log_std = std_network.output_layer
    else:
        if adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim,),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

    self.min_std = min_std
    self._set_std_to_0 = False

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        log_std_var = TT.maximum(log_std_var, np.log(min_std))

    self._mean_var, self._log_std_var = mean_var, log_std_var

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = dist_cls(action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy, self).__init__(env_spec)

    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )
def __init__(self, env_params, sumo_params, scenario):
    # Invoke serializable if using rllab
    if serializable_flag:
        Serializable.quick_init(self, locals())

    self.env_params = env_params
    self.scenario = scenario
    self.sumo_params = sumo_params
    time_stamp = ''.join(str(time.time()).split('.'))
    if os.environ.get("TEST_FLAG", 0):
        # 1.0 works with stress_test_start 10k times
        time.sleep(1.0 * int(time_stamp[-6:]) / 1e6)
    self.sumo_params.port = sumolib.miscutils.getFreeSocketPort()
    self.vehicles = scenario.vehicles
    self.traffic_lights = scenario.traffic_lights
    # time_counter: number of steps taken since the start of a rollout
    self.time_counter = 0
    # step_counter: number of total steps taken
    self.step_counter = 0
    # initial_state:
    #   Key = Vehicle ID,
    #   Entry = (type_id, route_id, lane_index, lane_pos, speed, pos)
    self.initial_state = {}
    self.state = None
    self.obs_var_labels = []

    # simulation step size
    self.sim_step = sumo_params.sim_step

    self.vehicle_arrangement_shuffle = \
        env_params.vehicle_arrangement_shuffle
    self.starting_position_shuffle = env_params.starting_position_shuffle

    # the available_routes variable contains a dictionary of routes
    # vehicles can traverse; to be used when routes need to be chosen
    # dynamically
    self.available_routes = self.scenario.rts

    # TraCI connection used to communicate with sumo
    self.traci_connection = None

    # dictionary of initial observations used while resetting vehicles
    # after each rollout
    self.initial_observations = dict.fromkeys(self.vehicles.get_ids())

    # store the initial vehicle ids
    self.initial_ids = deepcopy(self.vehicles.get_ids())

    # store the initial state of the vehicles class (for restarting sumo)
    self.initial_vehicles = deepcopy(self.vehicles)

    # colors used to distinguish between types of vehicles in the network
    self.colors = {}

    # contains the subprocess.Popen instance used to start traci
    self.sumo_proc = None

    self.start_sumo()
    self.setup_initial_state()

    # use pyglet to render the simulation
    if self.sumo_params.render in ['gray', 'dgray', 'rgb', 'drgb']:
        save_render = self.sumo_params.save_render
        sight_radius = self.sumo_params.sight_radius
        pxpm = self.sumo_params.pxpm
        show_radius = self.sumo_params.show_radius

        # get network polygons
        network = []
        for lane_id in self.traci_connection.lane.getIDList():
            _lane_poly = self.traci_connection.lane.getShape(lane_id)
            lane_poly = [i for pt in _lane_poly for i in pt]
            network.append(lane_poly)

        # instantiate a pyglet renderer
        self.renderer = Renderer(
            network,
            self.sumo_params.render,
            save_render,
            sight_radius=sight_radius,
            pxpm=pxpm,
            show_radius=show_radius)

        # render a frame
        self.render(reset=True)
    elif self.sumo_params.render in [True, False]:
        pass  # default to sumo-gui (if True) or sumo (if False)
    else:
        raise ValueError("Mode %s is not supported!" %
                         self.sumo_params.render)
def __init__(
        self,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
     network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the
     mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer()
        else:
            optimizer = LbfgsOptimizer()

    self.output_dim = output_dim
    self._optimizer = optimizer

    if prob_network is None:
        prob_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    l_prob = prob_network.output_layer

    LasagnePowered.__init__(self, [l_prob])

    xs_var = prob_network.input_layer.input_var
    ys_var = TT.imatrix("ys")
    old_prob_var = TT.matrix("old_prob")

    x_mean_var = theano.shared(
        np.zeros((1,) + input_shape),
        name="x_mean",
        broadcastable=(True,) + (False,) * len(input_shape)
    )
    x_std_var = theano.shared(
        np.ones((1,) + input_shape),
        name="x_std",
        broadcastable=(True,) + (False,) * len(input_shape)
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var

    prob_var = L.get_output(l_prob,
                            {prob_network.input_layer: normalized_xs_var})

    old_info_vars = dict(prob=old_prob_var)
    info_vars = dict(prob=prob_var)

    dist = self._dist = Categorical()

    mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

    loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

    predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1),
                                      output_dim)

    self._f_predict = ext.compile_function([xs_var], predicted)
    self._f_prob = ext.compile_function([xs_var], prob_var)
    self._l_prob = l_prob

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[prob_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
def __init__(
        self,
        env_spec,
        hidden_sizes=(32,),
        state_include_action=True,
        hidden_nonlinearity=NL.tanh,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
        trunc_steps=20,
):
    """
    :param env_spec: A spec for the env.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    """
    Serializable.quick_init(self, locals())
    super(GaussianGRUPolicy, self).__init__(env_spec)

    assert len(hidden_sizes) == 1

    if state_include_action:
        obs_dim = (env_spec.observation_space.flat_dim +
                   env_spec.action_space.flat_dim)
    else:
        obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    mean_network = GRUNetwork(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_dim=hidden_sizes[0],
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        trunc_steps=trunc_steps
    )

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_var

    l_log_std = ParamLayer(
        mean_network.input_layer,
        num_units=action_dim,
        param=lasagne.init.Constant(np.log(init_std)),
        name="output_log_std",
        trainable=learn_std,
    )

    l_step_log_std = ParamLayer(
        mean_network.step_input_layer,
        num_units=action_dim,
        param=l_log_std.param,
        name="step_output_log_std",
        trainable=learn_std,
    )

    self._mean_network = mean_network
    self._l_log_std = l_log_std
    self._state_include_action = state_include_action

    self._f_step_mean_std = ext.compile_function(
        [
            mean_network.step_input_layer.input_var,
            mean_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            mean_network.step_output_layer,
            l_step_log_std,
            mean_network.step_hidden_layer
        ])
    )

    self._prev_action = None
    self._prev_hidden = None
    self._hidden_sizes = hidden_sizes
    self._dist = RecurrentDiagonalGaussian(action_dim)

    self.reset()

    LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        max_std=1000.0,
        std_modifier=1.0,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
        grad_step_size=1.0,
        stop_grad=False,
        extra_input_dim=0,
        # metalearn_baseline=False,
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
     for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There
     are a few options:
        - exp: the logarithm of the std will be stored, and applied an
          exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    :param grad_step_size: the step size taken in the learner's gradient
     update; sampled uniformly if it is a range, e.g. [0.1, 1]
    :param stop_grad: whether or not to stop the gradient through the
     gradient.
    """
    Serializable.quick_init(self, locals())
    # assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    self.action_dim = env_spec.action_space.flat_dim
    self.n_hidden = len(hidden_sizes)
    self.hidden_nonlinearity = hidden_nonlinearity
    self.output_nonlinearity = output_nonlinearity
    self.input_shape = (None, obs_dim + extra_input_dim,)
    self.step_size = grad_step_size
    self.stop_grad = stop_grad
    # self.metalearn_baseline = metalearn_baseline
    if type(self.step_size) == list:
        raise NotImplementedError("removing this since it didn't work well")

    # create network
    if mean_network is None:
        # TODO: this should not be a method of the policy! --> helper
        self.all_params = self.create_MLP(
            name="mean_network",
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
        )
        self.input_tensor, _ = self.forward_MLP(
            'mean_network', self.all_params,
            reuse=None  # Need to run this for batch norm
        )
        forward_mean = lambda x, params, is_train: self.forward_MLP(
            'mean_network', all_params=params,
            input_tensor=x, is_training=is_train)[1]
    else:
        raise NotImplementedError('Not supported.')

    if std_network is not None:
        raise NotImplementedError('Not supported.')
    else:
        if adaptive_std:
            raise NotImplementedError('Not supported.')
        else:
            if std_parametrization == 'exp':
                init_std_param = np.log(init_std)
            elif std_parametrization == 'softplus':
                init_std_param = np.log(np.exp(init_std) - 1)
            else:
                raise NotImplementedError
            self.all_params['std_param'] = make_param_layer(
                num_units=self.action_dim,
                param=tf.constant_initializer(init_std_param),
                name="output_std_param",
                trainable=learn_std,
            )
            forward_std = lambda x, params: forward_param_layer(
                x, params['std_param'])
    self.all_param_vals = None

    # unify forward mean and forward std into a single function
    self._forward = lambda obs, params, is_train: (
        forward_mean(obs, params, is_train), forward_std(obs, params))

    self.std_parametrization = std_parametrization

    if std_parametrization == 'exp':
        min_std_param = np.log(min_std)
        max_std_param = np.log(max_std)
    elif std_parametrization == 'softplus':
        min_std_param = np.log(np.exp(min_std) - 1)
        max_std_param = np.log(np.exp(max_std) - 1)
    else:
        raise NotImplementedError

    self.min_std_param = min_std_param  # TODO: change these to min_std_param_raw
    self.max_std_param = max_std_param
    self.std_modifier = np.float64(std_modifier)
    # print(self.std_modifier)
    # self.std_modifier = 0.00001  # np.float64(std_modifier)
    # print("initializing max_std debug4", self.min_std_param, self.max_std_param)

    self._dist = DiagonalGaussian(self.action_dim)
    self._cached_params = {}

    super(MAMLGaussianMLPPolicy, self).__init__(env_spec)

    dist_info_sym = self.dist_info_sym(self.input_tensor, dict(),
                                       is_training=False)
    mean_var = dist_info_sym["mean"]
    log_std_var = dist_info_sym["log_std"]

    # pre-update policy
    self._init_f_dist = tensor_utils.compile_function(
        inputs=[self.input_tensor],
        outputs=[mean_var, log_std_var],
    )
    self._cur_f_dist = self._init_f_dist
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        std_parametrization='exp'):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
     for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There
     are a few options:
        - exp: the logarithm of the std will be stored, and applied an
          exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    with tf.variable_scope(name):
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=(obs_dim * 2,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_std_param = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    name="std_network",
                    input_shape=(obs_dim * 2,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_std_param = std_network.output_layer
            else:
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                l_std_param = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )

        self.std_parametrization = std_parametrization

        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

        # mean_var, log_std_var = L.get_output([l_mean, l_std_param])
        #
        # if self.min_std_param is not None:
        #     log_std_var = tf.maximum(log_std_var, np.log(min_std))
        #
        # self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_std_param = l_std_param

        self._dist = DiagonalGaussian(action_dim)

        LayersPowered.__init__(self, [l_mean, l_std_param])
        super(GaussianMLPInversePolicy, self).__init__(env_spec)

        dist_info_sym = self.dist_info_sym(
            mean_network.input_layer.input_var, dict())
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        self._f_dist = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
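# Quick check of the softplus inversion used above: with
# init_std_param = log(exp(s) - 1), softplus(init_std_param) recovers the
# requested initial std s.
import numpy as np
_s = 1.5
_p = np.log(np.exp(_s) - 1)
assert np.isclose(np.log(1 + np.exp(_p)), _s)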
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        hidden_W_init=lasagne.init.HeUniform(),
        hidden_b_init=lasagne.init.Constant(0.),
        action_merge_layer=-2,
        output_nonlinearity=None,
        output_W_init=lasagne.init.Uniform(-3e-3, 3e-3),
        output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
        bn=False):
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim),
                         name="obs")
    l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim),
                            name="actions")

    n_layers = len(hidden_sizes) + 1

    if n_layers > 1:
        action_merge_layer = \
            (action_merge_layer % n_layers + n_layers) % n_layers
    else:
        action_merge_layer = 1

    l_hidden = l_obs

    for idx, size in enumerate(hidden_sizes):
        if bn:
            l_hidden = batch_norm(l_hidden)

        if idx == action_merge_layer:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_hidden = L.DenseLayer(
            l_hidden,
            num_units=size,
            W=hidden_W_init,
            b=hidden_b_init,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % (idx + 1)
        )

    if action_merge_layer == n_layers:
        l_hidden = L.ConcatLayer([l_hidden, l_action])

    l_output = L.DenseLayer(
        l_hidden,
        num_units=1,
        W=output_W_init,
        b=output_b_init,
        nonlinearity=output_nonlinearity,
        name="output"
    )

    output_var = L.get_output(l_output, deterministic=True).flatten()

    self._f_qval = ext.compile_function(
        [l_obs.input_var, l_action.input_var], output_var)
    self._output_layer = l_output
    self._obs_layer = l_obs
    self._action_layer = l_action
    self._output_nonlinearity = output_nonlinearity

    LasagnePowered.__init__(self, [l_output])
def __init__(
        self,
        base_kwargs,
        env,
        arr_actor,
        best_actor,
        dict_ph,
        arr_initial_exploration_policy,
        with_best=False,
        initial_beta_t=1,
        plotter=None,
        specific_type=0,
        target_noise_scale=0.2,
        target_noise_clip=0.5,
        target_ratio=2,
        target_range=0.04,
        lr=3e-3,
        discount=0.99,
        tau=0.01,
        policy_update_interval=2,
        best_update_interval=2,
        reparameterize=False,
        save_full_state=False,
):
    """
    Args:
        base_kwargs (dict): dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object; must expose one
            sub-env per actor in `env.envs`.
        arr_actor (list): actors trained in parallel, each bundling its
            own policy and Q-functions.
        best_actor: actor designated to hold the best policy, or None.
        dict_ph (dict): shared TensorFlow placeholders.
        arr_initial_exploration_policy (list): per-actor policies used for
            initial exploration; these are not trained by the algorithm.
        with_best (`bool`): If True, guide the actors with the best actor.
        initial_beta_t (`float`): Initial value of the beta_t coefficient.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing Q-function during training.
        target_noise_scale (`float`): Std of the TD3-style target policy
            smoothing noise.
        target_noise_clip (`float`): Clipping bound for that noise.
        target_ratio (`float`), target_range (`float`): P3S thresholds
            controlling when and how closely actors follow the best actor.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft target update weight.
        policy_update_interval (`int`): Frequency of delayed policy
            updates, in iterations.
        best_update_interval (`int`): Frequency at which the best actor is
            re-selected, in iterations.
        reparameterize (`bool`): If True, we use a gradient estimator for
            the policy derived using the reparameterization trick;
            otherwise a likelihood ratio based estimator.
        save_full_state (`bool`): If True, save the full class in the
            snapshot. See `self.get_snapshot` for more information.
    """
    Serializable.quick_init(self, locals())
    super(P3S_TD3, self).__init__(**base_kwargs)

    self._env = env
    self._max_actions = int(self._env.action_space.high[0])
    self._arr_actor = arr_actor
    self._best_actor = best_actor
    self._best_actor_num = -1
    self._num_iter_select_best = 1

    assert len(self._env.envs) == len(self._arr_actor)

    self._num_actor = len(self._arr_actor)
    self._n_train_repeat = self._num_actor
    self._dict_ph = dict_ph
    self._arr_initial_exploration_policy = arr_initial_exploration_policy
    self._with_best = with_best
    self._best_flag = np.ones(self._num_actor)
    self._beta_t = initial_beta_t
    self._plotter = plotter

    self._target_noise_scale = target_noise_scale
    self._target_noise_clip = target_noise_clip
    self._target_ratio = target_ratio
    self._target_range = target_range

    self._policy_lr = lr
    self._qf_lr = lr
    self._vf_lr = lr
    self._discount = discount
    self._tau = tau
    self._policy_update_interval = policy_update_interval
    self._best_update_interval = best_update_interval

    # Reparameterize parameter must match between the algorithm and the
    # policy actions are sampled from.
    self._save_full_state = save_full_state
    self._saver = tf.train.Saver(max_to_keep=1000)
    self._save_dir = '/home/wisrl/wyjung/Result/log/Mujoco/ant_delay20/test_IPE_TD3_NA4_TRatio2_Trange0.03_update1_ver3_new_201906/iter6/'
    # '/test_IPE_TD3_NA' + str(NUM_ACTORS) + '_TRatio' + str(TARGET_RATIO)
    # + '_TRange' + str(TARGET_RANGE) + '_update' + str(UPDATE_BEST_ITER)
    # + '_ver' + str(VERSION) + '_new_201906'
    self._save_iter_num = 40000

    self._Da = self._env.action_space.flat_dim
    self._Do = self._env.observation_space.flat_dim

    if self._best_actor is not None:
        self._init_critic_update(actor=self._best_actor)
        self._init_actor_update(actor=self._best_actor)
        self._init_target_ops(actor=self._best_actor)

    for actor in self._arr_actor:
        self._init_critic_update(actor=actor)
        self._init_actor_update(actor=actor)
        self._init_target_ops(actor=actor)
        self._init_update_old_new_ops(actor=actor)

    self._sess.run(tf.variables_initializer([
        variable for variable in tf.global_variables()
        if 'low_level_policy' not in variable.name
    ]))
    self._update_old_new()

    for actor in self._arr_actor:
        source_params = actor.current_params()
        target_params = actor.target_params()
        copy_ops = [
            tf.assign(target, source)
            for target, source in zip(target_params, source_params)
        ]
        self._sess.run(copy_ops)

    if self._best_actor is not None:
        source_params = self._best_actor.current_params()
        target_params = self._best_actor.target_params()
        copy_ops = [
            tf.assign(target, source)
            for target, source in zip(target_params, source_params)
        ]
        self._sess.run(copy_ops)

        for actor in self._arr_actor:
            source_params = self._best_actor.trainable_params()
            target_params = actor.trainable_params()
            copy_ops = [
                tf.assign(target, source)
                for target, source in zip(target_params, source_params)
            ]
            self._sess.run(copy_ops)

    print("Initialization is finished!")
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
        std_modifier=1.0,
        extra_input_dim=0,
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
     for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There
     are a few options:
        - exp: the logarithm of the std will be stored, and applied an
          exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    # create network
    if mean_network is None:
        self.mean_params = mean_params = self.create_MLP(
            name="mean_network",
            input_shape=(None, obs_dim + extra_input_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
        )
        self.input_tensor, mean_tensor = self.forward_MLP(
            'mean_network', mean_params,
            n_hidden=len(hidden_sizes),
            input_shape=(obs_dim,),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            reuse=None  # Needed for batch norm
        )
        # if you want to input your own thing.
        self._forward_mean = lambda x, is_train: self.forward_MLP(
            'mean_network', mean_params,
            n_hidden=len(hidden_sizes),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            input_tensor=x, is_training=is_train)[1]
    else:
        raise NotImplementedError('Chelsea does not support this.')

    if std_network is not None:
        raise NotImplementedError(
            'Minimal Gaussian MLP does not support this.')
    else:
        if adaptive_std:
            # NOTE - this branch isn't tested
            raise NotImplementedError(
                'Minimal Gaussian MLP doesnt have a tested version of this.')
            self.std_params = std_params = self.create_MLP(
                name="std_network",
                input_shape=(None, obs_dim,),
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
            )
            # if you want to input your own thing.
            self._forward_std = lambda x: self.forward_MLP(
                'std_network', std_params,
                n_hidden=len(hidden_sizes),
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=tf.identity,
                input_tensor=x)[1]
        else:
            if std_parametrization == 'exp':
                init_std_param = np.log(init_std)
            elif std_parametrization == 'softplus':
                init_std_param = np.log(np.exp(init_std) - 1)
            else:
                raise NotImplementedError
            self.std_params = make_param_layer(
                num_units=action_dim,
                param=tf.constant_initializer(init_std_param),
                name="output_std_param",
                trainable=learn_std,
            )
            self._forward_std = lambda x: forward_param_layer(
                x, self.std_params)

    self.std_parametrization = std_parametrization

    if std_parametrization == 'exp':
        min_std_param = np.log(min_std)
    elif std_parametrization == 'softplus':
        min_std_param = np.log(np.exp(min_std) - 1)
    else:
        raise NotImplementedError

    self.min_std_param = min_std_param
    self.std_modifier = std_modifier

    self._dist = DiagonalGaussian(action_dim)
    self._cached_params = {}

    super(GaussianMLPPolicy, self).__init__(env_spec)

    dist_info_sym = self.dist_info_sym(self.input_tensor, dict(),
                                       is_training=False)
    mean_var = dist_info_sym["mean"]
    log_std_var = dist_info_sym["log_std"]

    self._init_f_dist = tensor_utils.compile_function(
        inputs=[self.input_tensor],
        outputs=[mean_var, log_std_var],
    )
    self._cur_f_dist = self._init_f_dist
def __init__(
        self,
        env_spec,
        env,  # the inner one, I believe
        pkl_path=None,  # for the entire hierarchical policy
        snn_pkl_path=None,
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        min_period=1,
        max_period=10,  # possible periods
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        continuous_latent=False,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.periods = np.arange(min_period, max_period + 1)
    assert len(self.periods) > 0
    self.curr_period = self.periods[0]
    self.max_period = max(self.periods)
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None  # something
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager
    self.random_period = True
    self.fake_env = PeriodVaryingEnv(env)
    self.continuous_latent = continuous_latent
    self.trainable_snn = trainable_snn

    if pkl_path and '.npz' not in pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space,
        #                          action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec,
        #                                     latent_dim=latent_dim, )
    else:
        # env spec that includes the extra parameter for time
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=self.fake_env.spec,
            env=self.fake_env,
            pkl_path=snn_pkl_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            hidden_sizes_selector=hidden_sizes_selector)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(
                os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10,
            #                                  pkl_path=snn_pkl_path)
            if self.continuous_latent:
                outer_env_spec = EnvSpec(
                    observation_space=self.fake_env.observation_space,
                    action_space=spaces.Box(-1.0, 1.0, shape=(latent_dim,)))
                self.manager = GaussianMLPPolicy(env_spec=outer_env_spec)
            else:
                outer_env_spec = EnvSpec(
                    observation_space=self.fake_env.observation_space,
                    action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(
                    env_spec=outer_env_spec,
                    latent_dim=latent_dim,
                )

    if pkl_path is not None and '.npz' in pkl_path:
        param_dict = dict(
            np.load(os.path.join(config.PROJECT_PATH, pkl_path)))
        param_values = param_dict['params']
        self.set_param_values(param_values)

    if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                env.wrapped_env, GatherEnv):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0

    Serializable.quick_init(self, locals())  # todo: ask if this fixes my problem
def __init__(self, env_spec, endpoints, outside_value):
    Serializable.quick_init(self, locals())
    self._env_spec = env_spec
    self.schedule = schedules.PiecewiseSchedule(
        endpoints=endpoints, outside_value=outside_value)
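# Illustrative usage, assuming `schedules.PiecewiseSchedule` follows the
# usual (t, value) endpoint convention: linear interpolation between
# endpoints, `outside_value` beyond them.
#
#   sched = schedules.PiecewiseSchedule(
#       endpoints=[(0, 1.0), (10000, 1.0), (50000, 0.1)],
#       outside_value=0.1)
#   sched.value(30000)  # -> 0.55, halfway between 1.0 and 0.1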
def __init__(self, env_spec, hidden_dim=32, feature_network=None,
             state_include_action=True, hidden_nonlinearity=NL.tanh):
    """
    :param env_spec: A spec for the env.
    :param hidden_dim: dimension of hidden layer
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    """
    assert isinstance(env_spec.action_space, Discrete)

    Serializable.quick_init(self, locals())
    super(CategoricalGRUPolicy, self).__init__(env_spec)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    if state_include_action:
        input_dim = obs_dim + action_dim
    else:
        input_dim = obs_dim

    l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

    if feature_network is None:
        feature_dim = input_dim
        l_flat_feature = None
        l_feature = l_input
    else:
        feature_dim = feature_network.output_layer.output_shape[-1]
        l_flat_feature = feature_network.output_layer
        l_feature = OpLayer(
            l_flat_feature,
            extras=[l_input],
            name="reshape_feature",
            op=lambda flat_feature, input: TT.reshape(
                flat_feature,
                [input.shape[0], input.shape[1], feature_dim]),
            shape_op=lambda _, input_shape: (
                input_shape[0], input_shape[1], feature_dim))

    prob_network = GRUNetwork(
        input_shape=(feature_dim,),
        input_layer=l_feature,
        output_dim=env_spec.action_space.n,
        hidden_dim=hidden_dim,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=TT.nnet.softmax,
        name="prob_network")

    self.prob_network = prob_network
    self.feature_network = feature_network
    self.l_input = l_input
    self.state_include_action = state_include_action

    flat_input_var = TT.matrix("flat_input")
    if feature_network is None:
        feature_var = flat_input_var
    else:
        feature_var = L.get_output(
            l_flat_feature, {feature_network.input_layer: flat_input_var})

    self.f_step_prob = ext.compile_function(
        [flat_input_var, prob_network.step_prev_hidden_layer.input_var],
        L.get_output([
            prob_network.step_output_layer,
            prob_network.step_hidden_layer
        ], {prob_network.step_input_layer: feature_var}))

    self.input_dim = input_dim
    self.action_dim = action_dim
    self.hidden_dim = hidden_dim

    self.prev_action = None
    self.prev_hidden = None
    self.dist = RecurrentCategorical(env_spec.action_space.n)

    out_layers = [prob_network.output_layer]
    if feature_network is not None:
        out_layers.append(feature_network.output_layer)

    LasagnePowered.__init__(self, out_layers)