def configure_her(params):
    env = cached_make_env(params['make_env'])
    env.reset()
    if params['structure'] == 'flat':
        env.unwrapped.set_flat_env()

    def reward_fun(ag_2, g, task_descr, info):  # vectorized
        return env.unwrapped.compute_reward(achieved_goal=ag_2, goal=g,
                                            task_descr=task_descr, info=info)

    # Prepare configuration for HER.
    her_params = {
        'reward_fun': reward_fun,
        'tasks_ag_id': params['tasks_ag_id'],
        'tasks_g_id': params['tasks_g_id'],
        'goal_replay': params['goal_replay'],
        'her_replay_k': params['her_replay_k'],
        'task_replay': params['task_replay'],
    }
    her_sampling_func = import_function(params['her_sampling_func'])
    sample_her_transitions = her_sampling_func(**her_params)

    return sample_her_transitions
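# Hedged illustration (not part of the original module): a vectorized reward in the
# style HER expects, i.e. it accepts batches of achieved goals and goals and returns
# one reward per row, which is what reward_fun above delegates to the environment's
# compute_reward. The 0.05 threshold and the sparse -1/0 scheme below are assumptions
# for the sketch, not the actual compute_reward of any particular environment.
import numpy as np


def example_sparse_reward(ag_2, g, info=None, threshold=0.05):
    # Distance between each achieved goal and its target goal (row-wise).
    d = np.linalg.norm(ag_2 - g, axis=-1)
    # -1 while the goal is not reached, 0 once it is.
    return -(d > threshold).astype(np.float32)


# Example call on a batch of four 3-dimensional goals -> four rewards.
rewards = example_sparse_reward(np.zeros((4, 3)), np.ones((4, 3)))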
def __init__(self, input_dims, buffer_size, hidden, layers, network_class_actor_critic,
             network_class_discriminator, polyak, batch_size, Q_lr, pi_lr, mi_lr, sk_lr,
             r_scale, mi_r_scale, sk_r_scale, et_r_scale, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals,
             relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma,
             env_name, max_timesteps, pretrain_weights, finetune_pi, mi_prioritization,
             sac, reuse=False, history_len=10000, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class_actor_critic)
    self.create_discriminator = import_function(self.network_class_discriminator)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimz = self.input_dims['z']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']
    self.env_name = env_name

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    stage_shapes['w'] = (None,)
    stage_shapes['m'] = (None,)
    stage_shapes['s'] = (None,)
    stage_shapes['m_w'] = ()
    stage_shapes['s_w'] = ()
    stage_shapes['r_w'] = ()
    stage_shapes['e_w'] = ()
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(pretrain_weights, mi_prioritization, reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions, mi_prioritization)

    self.mi_r_history = deque(maxlen=history_len)
    self.gl_r_history = deque(maxlen=history_len)
    self.sk_r_history = deque(maxlen=history_len)
    self.et_r_history = deque(maxlen=history_len)
    self.mi_current = 0
    self.finetune_pi = finetune_pi
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak,
             batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs,
             scope, T, rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size,
             prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False,
             **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
    Added functionality to use demonstrations for training to overcome the exploration problem.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        q_filter: whether or not a filter on the Q value update should be used when training with demonstrations
        num_demo: number of episodes to be used in the demonstration buffer
        demo_batch_size: number of samples to be used from the demonstrations buffer, per MPI thread
        prm_loss_weight: weight corresponding to the primary loss
        aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)

    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)

    global DEMO_BUFFER
    # Initialize the demo buffer in the same way as the primary data buffer.
    DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak,
             batch_size, q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs,
             scope, time_horizon, rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False):
    """
    Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g), and the actions (u)
    :param buffer_size: (int) number of transitions that are stored in the replay buffer
    :param hidden: (int) number of units in the hidden layers
    :param layers: (int) number of hidden layers
    :param network_class: (str) the network class that should be used (e.g. 'baselines.her.ActorCritic')
    :param polyak: (float) coefficient for Polyak-averaging of the target network
    :param batch_size: (int) batch size for training
    :param q_lr: (float) learning rate for the Q (critic) network
    :param pi_lr: (float) learning rate for the pi (actor) network
    :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities
    :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip]
    :param max_u: (float) maximum action magnitude, i.e. actions are in [-max_u, max_u]
    :param action_l2: (float) coefficient for L2 penalty on the actions
    :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs]
    :param scope: (str) the scope used for the TensorFlow graph
    :param time_horizon: (int) the time horizon for rollouts
    :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent
    :param subtract_goals: (function (numpy Number, numpy Number): numpy Number) function that subtracts goals from each other
    :param relative_goals: (boolean) whether or not relative goals should be fed into the network
    :param clip_pos_returns: (boolean) whether or not positive returns should be clipped
    :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
    :param sample_transitions: (function (dict, int): dict) function that samples from the replay buffer
    :param gamma: (float) gamma used for Q learning updates
    :param reuse: (boolean) whether or not the networks should be reused
    """
    # Updated in experiments/config.py
    self.input_dims = input_dims
    self.buffer_size = buffer_size
    self.hidden = hidden
    self.layers = layers
    self.network_class = network_class
    self.polyak = polyak
    self.batch_size = batch_size
    self.q_lr = q_lr
    self.pi_lr = pi_lr
    self.norm_eps = norm_eps
    self.norm_clip = norm_clip
    self.max_u = max_u
    self.action_l2 = action_l2
    self.clip_obs = clip_obs
    self.scope = scope
    self.time_horizon = time_horizon
    self.rollout_batch_size = rollout_batch_size
    self.subtract_goals = subtract_goals
    self.relative_goals = relative_goals
    self.clip_pos_returns = clip_pos_returns
    self.clip_return = clip_return
    self.sample_transitions = sample_transitions
    self.gamma = gamma
    self.reuse = reuse

    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)

    self.dim_obs = self.input_dims['o']
    self.dim_goal = self.input_dims['g']
    self.dim_action = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.time_horizon if key != 'o' else self.time_horizon + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal)
    buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.time_horizon,
                               self.sample_transitions)
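# Hedged sketch (not part of the original class): it only illustrates the per-episode
# shape convention used above, where observations ('o') and achieved goals ('ag') keep
# time_horizon + 1 entries while actions and goals keep time_horizon, because each of
# the time_horizon transitions needs both the state it starts from and the state it
# ends in. The dimensions below (T=50, dim_obs=10, dim_goal=3, dim_action=4) are made up.
import numpy as np

T, dim_obs, dim_goal, dim_action = 50, 10, 3, 4
example_episode = {
    'o': np.zeros((1, T + 1, dim_obs)),    # s_0 ... s_T
    'ag': np.zeros((1, T + 1, dim_goal)),  # achieved goal at every state
    'g': np.zeros((1, T, dim_goal)),       # goal in effect for each transition
    'u': np.zeros((1, T, dim_action)),     # action taken at each transition
}
assert example_episode['o'].shape[1] == example_episode['u'].shape[1] + 1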
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak,
             batch_size, Q_lr, pi_lr, norm_eps, norm_clip, action_scale, action_l2,
             clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size,
             prm_loss_weight, aux_loss_weight, sample_transitions, gamma, temperature,
             prioritization, env_name, alpha, beta0, beta_iters, total_timesteps,
             rank_method, reuse=False, **kwargs):
    """
    Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
    Added functionality to use demonstrations for training to overcome the exploration problem.

    :param input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
    :param buffer_size (int): number of transitions that are stored in the replay buffer
    :param hidden (int): number of units in the hidden layers
    :param layers (int): number of hidden layers
    :param network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
    :param polyak (float): coefficient for Polyak-averaging of the target network
    :param batch_size (int): batch size for training
    :param Q_lr (float): learning rate for the Q (critic) network
    :param pi_lr (float): learning rate for the pi (actor) network
    :param norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
    :param norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
    :param action_scale (float): maximum action magnitude, i.e. actions are in [-action_scale, action_scale]
    :param action_l2 (float): coefficient for L2 penalty on the actions
    :param clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
    :param scope (str): the scope used for the TensorFlow graph
    :param T (int): the time horizon for rollouts
    :param rollout_batch_size (int): number of parallel rollouts per DDPG agent
    :param subtract_goals (function): function that subtracts goals from each other
    :param relative_goals (boolean): whether or not relative goals should be fed into the network
    :param clip_pos_returns (boolean): whether or not positive returns should be clipped
    :param clip_return (float): clip returns to be in [-clip_return, clip_return]
    :param sample_transitions (function): function that samples from the replay buffer
    :param gamma (float): gamma used for Q learning updates
    :param reuse (boolean): whether or not the networks should be reused
    :param bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
    :param q_filter: whether or not a filter on the Q value update should be used when training with demonstrations
    :param num_demo: number of episodes to be used in the demonstration buffer
    :param demo_batch_size: number of samples to be used from the demonstrations buffer, per MPI thread
    :param prm_loss_weight: weight corresponding to the primary loss
    :param aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    # Points to actor_critic.py.
    self.create_actor_critic = import_function(self.network_class)

    self.input_dims = input_dims
    input_shapes = dims_to_shapes(input_dims)
    self.dimo = input_dims['o']
    self.dimg = input_dims['g']
    self.dimu = input_dims['u']

    self.sample_count = 1
    self.cycle_count = 1

    self.critic_loss_episode = []
    self.actor_loss_episode = []
    self.critic_loss_avg = []
    self.actor_loss_avg = []

    # Energy-based prioritization parameters.
    self.prioritization = prioritization
    self.env_name = env_name
    self.temperature = temperature
    self.rank_method = rank_method

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)  # creates the DDPG agent

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size

    if self.prioritization == 'energy':
        self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T,
                                         self.sample_transitions,
                                         self.prioritization, self.env_name)
    # elif self.prioritization == 'tderror':
    #     self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T,
    #                                           self.sample_transitions, alpha)
    #     if beta_iters is None:
    #         beta_iters = total_timesteps
    #     self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0)
    else:
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak,
             batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs,
             scope, T, rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size,
             prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False,
             **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
    Added functionality to use demonstrations for training to overcome the exploration problem.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        q_filter: whether or not a filter on the Q value update should be used when training with demonstrations
        num_demo: number of episodes to be used in the demonstration buffer
        demo_batch_size: number of samples to be used from the demonstrations buffer, per MPI thread
        prm_loss_weight: weight corresponding to the primary loss
        aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)

    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)

    global DEMO_BUFFER
    # Initialize the demo buffer in the same way as the primary data buffer.
    DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
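# Hedged sketch (not part of the original class): one way demonstration episodes are
# typically pushed into DEMO_BUFFER, assuming the baselines-style
# ReplayBuffer.store_episode(episode_batch) interface that the primary buffer also
# uses. The shapes follow the buffer_shapes convention above ('o' and 'ag' keep T
# entries, the rest T - 1); the dimensions and the all-zero data are placeholders,
# not real demonstrations.
def example_store_fake_demos(T=50, dimo=10, dimg=3, dimu=4, n_episodes=5):
    for _ in range(n_episodes):
        episode = {
            'o': np.zeros((1, T, dimo)),
            'ag': np.zeros((1, T, dimg)),
            'g': np.zeros((1, T - 1, dimg)),
            'u': np.zeros((1, T - 1, dimu)),
        }
        DEMO_BUFFER.store_episode(episode)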
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak,
             batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs,
             scope, T, rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, sample_transitions, gamma, replay_k,
             reward_fun=None, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    # Create the actor-critic networks. The class named by network_class is defined in
    # actor_critic.py and is assigned to network_class when the DDPG object is created.
    self.create_actor_critic = import_function(self.network_class)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    # Next state (o_2) and goal at next state (g_2).
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Adding variable for correcting bias - Ameet
    self.stage_shapes_new = OrderedDict()
    self.stage_shapes_new['bias'] = (None,)
    ##############################################

    # Create network.
    # The staging area is a TensorFlow construct used to feed input data to GPUs.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        # Adding bias term from section 3.4 - Ameet
        self.staging_tf_new = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes_new.keys()],
            shapes=list(self.stage_shapes_new.values()))
        self.buffer_ph_tf_new = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes_new.values()]
        self.stage_op_new = self.staging_tf_new.put(self.buffer_ph_tf_new)
        ############################################

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size

    # conf holds the parameters required for initializing the priority queue.
    # Remember: the bias gets annealed only conf.total_steps number of times.
    conf = {'size': self.buffer_size,
            'learn_start': self.batch_size,
            'batch_size': self.batch_size,
            # Heuristic for the partition size; it only matters when the buffer
            # is not yet full (unlikely).
            'partition_size': self.replay_k * 100}
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions, conf, self.replay_k)

    # global_step counts the number of batches used for updates.
    self.global_step = 0
    self.debug = {}
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak,
             batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs,
             scope, T, rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False,
             env=None, to_goal=None, nearby_action_penalty=False, nearby_penalty_weight=0,
             sample_expert=False, expert_batch_size=0., bc_loss=0., anneal_bc=0.,
             terminate_bootstrapping=False, mask_q=False, two_qs=False,
             anneal_discriminator=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)

    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    if two_qs:
        stage_shapes['r2'] = (None,)
        stage_shapes['w_q2'] = (None,)
    stage_shapes['successes'] = (None,)
    if nearby_action_penalty:
        stage_shapes['far_from_goal'] = (None,)
    if sample_expert:
        stage_shapes['is_demo'] = (None,)
        stage_shapes['annealing_factor'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    # print(self.stage_shapes.keys())
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_shapes['successes'] = (self.T,)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
    self.expert_buffer = None

    self.all_variables = self._global_vars('')

    if to_goal is None:
        print("to goal is none!")
        self.to_goal = (0, 2)
    else:
        self.to_goal = to_goal
    self.to_goal_func = (
        (lambda x: x[self.to_goal[0]:self.to_goal[1]])
        if len(self.to_goal) == 2
        else (lambda x: x[np.array(self.to_goal)])
    )

    self.nearby_action_penalty = nearby_action_penalty
    self.nearby_penalty_weight = nearby_penalty_weight
def __init__(self, input_dims, hidden, layers, network_class, polyak, batch_size,
             Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
             rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns,
             clip_return, normalize_obs, sample_transitions, gamma, buffers=None,
             reuse=False, tasks_ag_id=None, tasks_g_id=None, task_replay='', t_id=None,
             eps_task=None, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        buffers (list): buffers to be used to store new transitions (usually one per task + 1)
        tasks_ag_id (list): indices to find achieved goals for each task in the achieved goal vector
        tasks_g_id (list): indices to find goals for each task in the goal vector
        task_replay (str): defines the task replay strategy (see train.py for info)
        t_id (int): index of the task corresponding to this policy when using a task-experts structure
        eps_task (float): epsilon parameter for the epsilon-greedy strategy (task choice)
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    self.normalize_obs = normalize_obs

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimag = self.input_dims['ag']
    self.dimu = self.input_dims['u']
    if self.structure == 'curious' or self.structure == 'task_experts':
        self.dimtd = self.input_dims['task_descr']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None, 1)
    self.stage_shapes = stage_shapes

    if t_id is not None:
        self.scope += str(t_id)

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)

    # Additions for multi-task structures.
    if self.structure == 'curious' or self.structure == 'task_experts':
        self.tasks_g_id = tasks_g_id
        self.tasks_ag_id = tasks_ag_id
        self.nb_tasks = len(tasks_g_id)

    if buffers is not None:
        self.buffer = buffers
        if type(self.buffer) is list:
            if len(self.buffer) > 5:
                # Distractor buffers are equal.
                for i in range(6, len(self.buffer)):
                    self.buffer[i] = self.buffer[5]

    self.first = True
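# Hedged sketch (not part of the original class): what an epsilon-greedy task choice
# driven by eps_task typically looks like. Scoring tasks and picking the best one is a
# placeholder assumption; the actual task-selection logic lives elsewhere in the
# codebase (see train.py).
def example_epsilon_greedy_task(task_scores, eps_task):
    # With probability eps_task pick a random task, otherwise the highest-scoring one.
    if np.random.uniform() < eps_task:
        return np.random.randint(len(task_scores))
    return int(np.argmax(task_scores))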