def setup_staging_areas(self):
    for idx, device in enumerate(self._devices):
        with tf.device(device):
            inputs = self._input.get_input_tensors()
            dtypes = [x.dtype for x in inputs]
            stage = StagingArea(dtypes, shapes=None)
            self._stage_ops.append(stage.put(inputs))
            self._areas.append(stage)
            outputs = stage.get()
            if isinstance(outputs, tf.Tensor):
                # when size == 1, TF returns the bare tensor instead of a
                # list, which would break the zip below
                outputs = [outputs]
            for vin, vout in zip(inputs, outputs):
                vout.set_shape(vin.get_shape())
            self._unstage_ops.append(outputs)
def stage_data(self, batch, memory_gb=1, n_threads=4):
    """Stage `batch` on GPU via a StagingArea fed by queue-runner threads."""
    with tf.device('/gpu:0'):
        dtypes = [t.dtype for t in batch]
        shapes = [t.get_shape() for t in batch]
        SA = StagingArea(dtypes, shapes=shapes,
                         memory_limit=memory_gb * 1e9)
        get, put, clear = SA.get(), SA.put(batch), SA.clear()
        tf.train.add_queue_runner(
            tf.train.QueueRunner(queue=SA, enqueue_ops=[put] * n_threads,
                                 close_op=clear, cancel_op=clear))
    return get
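# A self-contained sketch of the same queue-runner + StagingArea pattern used
# by stage_data above (TF 1.x; the random tensor stands in for a real input
# pipeline, and the thread/step counts are illustrative assumptions).
import tensorflow as tf
from tensorflow.python.ops.data_flow_ops import StagingArea

x = tf.random_uniform([4, 8])  # stand-in for one input tensor of the batch
area = StagingArea([x.dtype], shapes=[x.get_shape()])
put, get, clear = area.put([x]), area.get(), area.clear()
tf.train.add_queue_runner(tf.train.QueueRunner(
    queue=area, enqueue_ops=[put] * 2, close_op=clear, cancel_op=clear))

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # start putters
    print(sess.run(get).shape)  # consume one staged tensor
    coord.request_stop()
    coord.join(threads)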
def setup_staging_areas(self):
    for idx, device in enumerate(self._devices):
        with tf.device(device):
            inputs = self._input.get_input_tensors()
            dtypes = [x.dtype for x in inputs]
            stage = StagingArea(dtypes, shapes=None)
            self._stage_ops.append(stage.put(inputs))
            self._areas.append(stage)
            outputs = stage.get()
            if isinstance(outputs, tf.Tensor):
                # when size=1, TF doesn't return a list
                outputs = [outputs]
            for vin, vout in zip(inputs, outputs):
                vout.set_shape(vin.get_shape())
            self._unstage_ops.append(outputs)
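# A minimal double-buffering sketch of how such stage/unstage ops are usually
# driven (an assumption about the surrounding training loop, not code from the
# original repo): one warm-up put keeps each StagingArea a batch ahead, so
# get() never blocks the training step.
import tensorflow as tf
from tensorflow.python.ops.data_flow_ops import StagingArea

inputs = [tf.random_uniform([2, 3])]           # stand-in input tensors
area = StagingArea([tf.float32], shapes=[[2, 3]])
stage_op = area.put(inputs)
staged = area.get()                            # bare tensor, since size == 1
train_op = tf.reduce_sum(staged)               # stand-in for the real train step

with tf.Session() as sess:
    sess.run(stage_op)                         # prefill: stage batch 0
    for step in range(3):
        # consume batch k and stage batch k+1 in the same run call
        loss, _ = sess.run([train_op, stage_op])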
def _prepare_staging(self):
    with tf.variable_scope('staging', reuse=tf.AUTO_REUSE):
        staging_area_tf = StagingArea(
            dtypes=[tf.float32 for _ in self._stage_shapes.keys()],
            shapes=[(None, *shape) for shape in self._stage_shapes.values()])
        input_ph_tf = [
            tf.placeholder(tf.float32, shape=(None, *shape))
            for shape in self._stage_shapes.values()
        ]
        staging_op_tf = staging_area_tf.put(input_ph_tf)
        batch_tf = OrderedDict([
            (key, batch_item)
            for key, batch_item in zip(self._stage_shapes.keys(),
                                       staging_area_tf.get())
        ])
    return staging_area_tf, input_ph_tf, staging_op_tf, batch_tf
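# Hypothetical caller for _prepare_staging (the names `sess` and
# `batch_values` are assumptions): stage one numpy batch through the
# placeholders, then read it back as named tensors.
#
#   staging_area, input_phs, staging_op, batch_tf = self._prepare_staging()
#   sess.run(staging_op,
#            feed_dict={ph: v for ph, v in zip(input_phs, batch_values)})
#   named_batch = sess.run(batch_tf)  # OrderedDict keyed like self._stage_shapes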
def train_qdqn(config, log_dir, make_env, model, cleanup=False):
    if cleanup:
        shutil.rmtree(log_dir, ignore_errors=True)

    np.random.seed(42)
    tf.set_random_seed(7)

    env = make_env(666)
    observation_space = env.observation_space
    action_space = env.action_space
    env.close()

    actor_queue = tf.FIFOQueue(
        capacity=config.queue_capacity,
        dtypes=[tf.uint8, tf.int32, tf.float32, tf.uint8, tf.float32,
                tf.int32],
        shapes=[observation_space.shape, action_space.shape, [],
                observation_space.shape, [], []])
    batch_shape = [config.batch_size]
    learner_queue = StagingArea(
        dtypes=[tf.uint8, tf.int32, tf.float32, tf.uint8, tf.float32],
        shapes=[batch_shape + list(observation_space.shape),
                batch_shape + list(action_space.shape),
                batch_shape,
                batch_shape + list(observation_space.shape),
                batch_shape],
        memory_limit=2**30)  # 1 GiB staging budget

    coord = tf.train.Coordinator()
    workers = []
    learner = Learner(learner_dir(log_dir), observation_space, action_space,
                      model, learner_queue, config,
                      create_learner_logger(log_dir))
    trainer = Trainer(config, actor_queue, learner_queue, observation_space,
                      action_space, create_trainer_logger(log_dir))
    workers.append(trainer)
    for i in range(config.actor_count):
        workers.append(Actor(
            i, i == 0, make_env(i), model, actor_queue, config,
            create_actor_logger(log_dir, i),
            create_json_logger(os.path.join(actor_dir(log_dir, i),
                                            'episodes')),
            should_render=False))

    with U.make_session(config.tf_thread_count) as session:
        U.initialize(session=session)
        learner.load(learner_dir(log_dir), session=session)
        threads = []
        for worker in workers:
            # bind `worker` via a default argument; a bare
            # `lambda: worker.run(...)` closes over the loop variable and can
            # race with the next iteration
            thread = threading.Thread(
                target=lambda w=worker: w.run(session, coord))
            thread.start()
            threads.append(thread)
        learner.run(session, coord)
        actor_queue.close()
        try:
            coord.join(threads, stop_grace_period_secs=10)
        except RuntimeError as e:
            print("Failed to join threads: {}".format(e))
def __init__(self, use_aux_tasks, input_dims, image_input_shapes, buffer_size,
             hidden, layers, dim_latent_repr, cnn_nonlinear,
             use_bottleneck_layer, polyak, batch_size, Q_lr, pi_lr, norm_eps,
             norm_clip, max_u, action_l2, scope, T, rollout_batch_size,
             clip_pos_returns, clip_return, log_loss, sample_transitions,
             gamma, rank, serialized=False, reuse=False, clip_grad_range=None,
             aux_filter_interval=None, scale_grad_by_procs=False,
             aux_update_interval=5, aux_base_lr=5, **kwargs):
    """See the documentation in main.py"""
    if self.clip_return is None:
        self.clip_return = np.inf
    self.create_actor_critic = import_function(
        'cnn_actor_critic:CNNActorCritic')

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']
    if self.use_aux_tasks:
        self.dim_bw_frame = self.input_dims['info_bw_frame']
        self.dim_op_flow = self.input_dims['info_op_flow']
        self.dim_transformed_frame = self.input_dims['info_transformed_frame']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    include_info = [
        'info_state_obs', 'info_transformed_frame', 'info_transformation',
        'info_op_flow', 'info_bw_frame'
    ]
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_') and key not in include_info:
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        if self.use_aux_tasks:
            # Initialize OL-AUX
            self.num_auxiliary_tasks = 5
            self.aux_weights_lr = self.aux_base_lr * self.aux_update_interval
            self.aux_weight_vector_Q_tf = tf.Variable(
                initial_value=1 * tf.ones(self.num_auxiliary_tasks),
                dtype=tf.float32, name='aux_weights')
            self.aux_weight_grads_buffer = []
            # Logging buffer for aux losses
            self.log_aux_losses_Q = self.log_aux_tasks_losses_pi = None
            if self.aux_filter_interval is not None:
                self.all_grad_history = deque(maxlen=self.aux_filter_interval)
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=self.reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' and not key.startswith('info_')
              else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, bc_loss, q_filter, num_demo,
             demo_batch_size, prm_loss_weight, aux_loss_weight,
             sample_transitions, gamma, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
    Adds functionality to use demonstrations for training, to overcome the exploration problem.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        q_filter: whether or not a filter on the Q value update should be used when training with demonstrations
        num_demo: number of episodes to be used in the demonstration buffer
        demo_batch_size: number of samples to be used from the demonstrations buffer, per MPI thread
        prm_loss_weight: weight corresponding to the primary loss
        aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
    """
    self.input_dims = input_dims
    self.buffer_size = buffer_size
    self.hidden = hidden
    self.layers = layers
    self.network_class = network_class
    self.polyak = polyak
    self.batch_size = batch_size
    self.Q_lr = Q_lr
    self.pi_lr = pi_lr
    self.norm_eps = norm_eps
    self.norm_clip = norm_clip
    self.max_u = max_u
    self.action_l2 = action_l2
    self.clip_obs = clip_obs
    self.scope = scope
    self.T = T
    self.subtract_goals = subtract_goals
    self.relative_goals = relative_goals
    self.clip_pos_returns = clip_pos_returns
    if clip_return is None:
        self.clip_return = np.inf
    else:
        self.clip_return = clip_return
    self.bc_loss = bc_loss
    self.q_filter = q_filter
    self.num_demo = num_demo
    self.demo_batch_size = demo_batch_size
    self.prm_loss_weight = prm_loss_weight
    self.aux_loss_weight = aux_loss_weight
    self.sample_transitions = sample_transitions
    self.gamma = gamma
    self.kwargs = kwargs

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['ag'] = (self.T, self.dimg)
    self.buffer = ReplayBuffer(buffer_shapes, self.buffer_size, self.T,
                               self.sample_transitions)

    global DEMO_BUFFER
    # initialize the demo buffer, in the same way as the primary data buffer
    DEMO_BUFFER = ReplayBuffer(buffer_shapes, self.buffer_size, self.T,
                               self.sample_transitions)
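# The staging setup above is typically paired with a stage_batch step that
# feeds sampled numpy arrays through the placeholders; a sketch in the style
# of baselines' HER code (sample_batch and self.sess are assumed attributes):
def stage_batch(self, batch=None):
    if batch is None:
        batch = self.sample_batch()  # list ordered like self.stage_shapes
    assert len(self.buffer_ph_tf) == len(batch)
    self.sess.run(self.stage_op,
                  feed_dict=dict(zip(self.buffer_ph_tf, batch)))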
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'GHER.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    # # print("\n\n\n\n1--", input_dims, "\n2--", buffer_size, "\n3--", hidden,
    # #       "\n4--", layers, "\n5--", network_class, "\n6--", polyak, "\n7--", batch_size,
    # #       "\n8--", Q_lr, "\n9--", pi_lr, "\n10--", norm_eps, "\n11--", norm_clip,
    # #       "\n12--", max_u, "\n13--", action_l2, "\n14--", clip_obs, "\n15--", scope, "\n16--", T,
    # #       "\n17--", rollout_batch_size, "\n18--", subtract_goals, "\n19--", relative_goals,
    # #       "\n20--", clip_pos_returns, "\n21--", clip_return,
    # #       "\n22--", sample_transitions, "\n23--", gamma)
    """ Example of parameter values in the FetchReach-v1 run:
        input_dims (dict of ints): {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1} (o, u, and g are all network inputs)
        buffer_size (int): 1E6 (total number of samples in the experience pool)
        hidden (int): 256 (number of hidden-layer neurons)
        layers (int): 3 (three-layer neural network)
        network_class (str): 'GHER.ActorCritic'
        polyak (float): 0.95 (smoothing parameter for target-network updates)
        batch_size (int): 256 (batch size)
        Q_lr (float): 0.001 (learning rate)
        pi_lr (float): 0.001 (learning rate)
        norm_eps (float): 0.01 (used to avoid numerical overflow)
        norm_clip (float): 5 (norm_clip)
        max_u (float): 1.0 (the action range is [-1.0, 1.0])
        action_l2 (float): 1.0 (regularization coefficient in the actor-network loss)
        clip_obs (float): 200 (obs is limited to (-200, +200))
        scope (str): "ddpg" (scope name used by TensorFlow)
        T (int): 50 (number of interactions per episode)
        rollout_batch_size (int): 2 (number of parallel rollouts per DDPG agent)
        subtract_goals (function): preprocesses the goal; with inputs a and b, outputs a - b
        relative_goals (boolean): False (True if goals need to be processed by subtract_goals)
        clip_pos_returns (boolean): True (whether positive returns should be clipped)
        clip_return (float): 50 (limit returns to [-clip_return, clip_return])
        sample_transitions (function): the function returned by HER; its parameters are defined in config.py
        gamma (float): 0.98 (discount factor used in the Q-network update)

        sample_transitions comes from the HER definition and is a key component.
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    # The network structure and computation graph are created in actor_critic.py
    self.create_actor_critic = import_function(self.network_class)

    # Extract dimensions
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']  # 10
    self.dimg = self.input_dims['g']  # 3
    self.dimu = self.input_dims['u']  # 4
    # print("+++", input_shapes)
    # {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)}

    # https://www.tensorflow.org/performance/performance_models
    # StagingArea provides simple functionality and can run in parallel with
    # other stages on CPU and GPU; splitting the input pipeline into separate
    # parallel operations scales to large multi-core environments.
    # Define the required storage variables. Suppose self.dimo=10, self.dimg=5,
    # self.dimu=5; then stage_shapes={'o': (None, 10), 'g': (None, 5), 'u': (None, 5)},
    # plus the variables used by the target network:
    # {'o_2': (None, 10), 'g_2': (None, 5)}

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)  # scalar reward
    self.stage_shapes = stage_shapes
    # After execution, self.stage_shapes =
    # OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)),
    #              ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))]),
    # i.e. g, o, u, the target-network inputs o_2 and g_2, and the reward r.

    # Create network.
    # Create tf placeholders matching stage_shapes, i.e. g, o, u, o_2, g_2, r:
    # self.buffer_ph_tf = [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>]
    with tf.variable_scope(self.scope):
        # Create the StagingArea
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        # Create TensorFlow placeholders
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        # Tie the placeholders to the StagingArea
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Replay-buffer bookkeeping.
    # With T = 50, buffer_shapes ends up as
    # {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'info_is_success': (50, 1), 'ag': (51, 3)}.
    # u, g, etc. record the T samples of a full episode, while o (and ag)
    # need one extra slot for the final observation.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    # buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    # buffer_shapes['ag'] = (self.T + 1, self.dimg)
    # print("+++", buffer_shapes)

    # buffer_size is measured in samples;
    # self.buffer_size=1E6, self.rollout_batch_size=2, buffer_size=1E6
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip,
             action_scale, action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight,
             aux_loss_weight, sample_transitions, gamma, temperature,
             prioritization, env_name, alpha, beta0, beta_iters,
             total_timesteps, rank_method, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
    Adds functionality to use demonstrations for training, to overcome the exploration problem.

    Args:
        :param input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        :param buffer_size (int): number of transitions that are stored in the replay buffer
        :param hidden (int): number of units in the hidden layers
        :param layers (int): number of hidden layers
        :param network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        :param polyak (float): coefficient for Polyak-averaging of the target network
        :param batch_size (int): batch size for training
        :param Q_lr (float): learning rate for the Q (critic) network
        :param pi_lr (float): learning rate for the pi (actor) network
        :param norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        :param norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        :param action_scale (float): maximum action magnitude, i.e. actions are in [-action_scale, action_scale]
        :param action_l2 (float): coefficient for L2 penalty on the actions
        :param clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        :param scope (str): the scope used for the TensorFlow graph
        :param T (int): the time horizon for rollouts
        :param rollout_batch_size (int): number of parallel rollouts per DDPG agent
        :param subtract_goals (function): function that subtracts goals from each other
        :param relative_goals (boolean): whether or not relative goals should be fed into the network
        :param clip_pos_returns (boolean): whether or not positive returns should be clipped
        :param clip_return (float): clip returns to be in [-clip_return, clip_return]
        :param sample_transitions (function): function that samples from the replay buffer
        :param gamma (float): gamma used for Q learning updates
        :param reuse (boolean): whether or not the networks should be reused
        :param bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        :param q_filter: whether or not a filter on the Q value update should be used when training with demonstrations
        :param num_demo: number of episodes to be used in the demonstration buffer
        :param demo_batch_size: number of samples to be used from the demonstrations buffer, per MPI thread
        :param prm_loss_weight: weight corresponding to the primary loss
        :param aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(
        self.network_class)  # points to actor_critic.py
    self.input_dims = input_dims
    input_shapes = dims_to_shapes(input_dims)
    self.dimo = input_dims['o']
    self.dimg = input_dims['g']
    self.dimu = input_dims['u']

    self.sample_count = 1
    self.cycle_count = 1
    self.critic_loss_episode = []
    self.actor_loss_episode = []
    self.critic_loss_avg = []
    self.actor_loss_avg = []

    # Energy-based prioritization parameters
    self.prioritization = prioritization
    self.env_name = env_name
    self.temperature = temperature
    self.rank_method = rank_method

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)  # creates the DDPG agent

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size

    # print("begin init")
    if self.prioritization == 'energy':
        self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T,
                                         self.sample_transitions,
                                         self.prioritization, self.env_name)
    # elif self.prioritization == 'tderror':
    #     self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size,
    #                                           self.T, self.sample_transitions, alpha)
    #     if beta_iters is None:
    #         beta_iters = total_timesteps
    #     self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0)
    else:
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, reuse=False, env=None, to_goal=None,
             nearby_action_penalty=False, nearby_penalty_weight=0,
             sample_expert=False, expert_batch_size=0., bc_loss=0.,
             anneal_bc=0., terminate_bootstrapping=False, mask_q=False,
             two_qs=False, anneal_discriminator=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf
    self.create_actor_critic = import_function(self.network_class)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    if two_qs:
        stage_shapes['r2'] = (None,)
        stage_shapes['w_q2'] = (None,)
    stage_shapes['successes'] = (None,)
    if nearby_action_penalty:
        stage_shapes['far_from_goal'] = (None,)
    if sample_expert:
        stage_shapes['is_demo'] = (None,)
        stage_shapes['annealing_factor'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    # print(self.stage_shapes.keys())
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_shapes['successes'] = (self.T,)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
    self.expert_buffer = None
    self.all_variables = self._global_vars('')

    if to_goal is None:
        print("to goal is none!")
        self.to_goal = (0, 2)
    else:
        self.to_goal = to_goal
    self.to_goal_func = (
        (lambda x: x[self.to_goal[0]:self.to_goal[1]])
        if len(self.to_goal) == 2
        else (lambda x: x[np.array(self.to_goal)]))
    self.nearby_action_penalty = nearby_action_penalty
    self.nearby_penalty_weight = nearby_penalty_weight
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight,
             aux_loss_weight, sample_transitions, gamma, seed,
             start_timesteps, eval_freq, max_timesteps, expl_noise,
             hrl_batch_size, discount, tau, policy_noise, noise_clip,
             policy_freq, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
    Adds functionality to use demonstrations for training, to overcome the exploration problem.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        q_filter: whether or not a filter on the Q value update should be used when training with demonstrations
        num_demo: number of episodes to be used in the demonstration buffer
        demo_batch_size: number of samples to be used from the demonstrations buffer, per MPI thread
        prm_loss_weight: weight corresponding to the primary loss
        aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
    """
    if self.clip_return is None:
        self.clip_return = np.inf
    self.create_actor_critic = import_function(self.network_class)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)

    global demoBuffer
    # initialize the demo buffer, in the same way as the primary data buffer
    demoBuffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                              self.sample_transitions)

    ################# hrl ###############
    ###############
    # BUILD MODEL #
    ###############
    # self.num_goal = num_goal
    # self.num_action = num_action
    # self.batch_size = batch_size
    # state_dim = 6
    # action_dim = 6
    # max_action = float(env.action_space.high[0])

    # Construct meta-controller and controller
    # self.meta_controller = MetaController().type(dtype)
    # self.target_meta_controller = MetaController().type(dtype)
    # self.controller = Controller().type(dtype)
    # self.target_controller = Controller().type(dtype)
    # self.meta_controller = TD3.TD3(state_dim, action_dim, max_action)
    # self.meta_controller = TD3(self.dimo, self.dimo, max_u)
    self.meta_controller = TD3(self.dimo + self.dimg, self.dimo,
                               self.clip_obs)
    # self.controller = TD3.TD3(state_dim, action_dim, max_action)
    self.controller = TD3(2 * self.dimo, self.dimu, max_u)
    # self.meta_replay_memory = ReplayBuffer()
    # self.ctrl_replay_memory = ReplayBuffer()
    self.low_replay_buffer = H_ReplayBuffer()
    self.high_replay_buffer = H_ReplayBuffer()
    self.clip_obs2 = 5
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, gg_k, replay_strategy, reuse=False,
             **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf
    self.create_actor_critic = import_function(self.network_class)
    self.replay_strategy = replay_strategy

    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K:
        self.max_g = kwargs['max_g']
        self.d0 = kwargs['d0']
        self.slope = kwargs['slope']
        self.goal_lr = kwargs['goal_lr']
        # reward shaping parameters
        self.rshape_lambda = kwargs['rshape_lambda']
        self.reshape_p = kwargs['rshape_p']
        self.rshaping = kwargs['rshaping']

    self.input_dims['e'] = self.dimg * self.T
    self.input_dims['mask'] = self.T
    self.dime = self.input_dims['e']
    self.dim_mask = self.input_dims['mask']
    input_shapes = dims_to_shapes(self.input_dims)

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    if self.replay_strategy in [C.REPLAY_STRATEGY_BEST_K,
                                C.REPLAY_STRATEGY_GEN_K,
                                C.REPLAY_STRATEGY_GEN_K_GMM]:
        buffer_shapes['gg'] = (self.T, self.gg_k, self.dimg)
    if self.replay_strategy in [C.REPLAY_STRATEGY_BEST_K,
                                C.REPLAY_STRATEGY_GEN_K_GMM]:
        buffer_shapes['gg_idx'] = (self.T, self.gg_k)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, replay_k, reward_fun=None,
             reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    # Create the actor-critic networks. network_class is defined in
    # actor_critic.py; that class is assigned to network_class when the DDPG
    # object is created.
    self.create_actor_critic = import_function(self.network_class)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    # Next state (o_2) and goal at next state (g_2)
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Adding variable for correcting bias - Ameet
    self.stage_shapes_new = OrderedDict()
    self.stage_shapes_new['bias'] = (None,)

    ##############################################
    # Create network.
    # StagingArea is a TF datatype used to feed input data onto GPUs.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        # Adding bias term from section 3.4 - Ameet
        self.staging_tf_new = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes_new.keys()],
            shapes=list(self.stage_shapes_new.values()))
        self.buffer_ph_tf_new = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes_new.values()
        ]
        self.stage_op_new = self.staging_tf_new.put(self.buffer_ph_tf_new)
        ############################################

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size

    # conf holds the parameters required for initializing the priority queue.
    # Remember: the bias gets annealed only conf.total_steps number of times.
    conf = {'size': self.buffer_size,
            'learn_start': self.batch_size,
            'batch_size': self.batch_size,
            # Heuristic for the partition size; it matters only when the
            # buffer is not full (unlikely)
            'partition_size': self.replay_k * 100}
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions, conf, self.replay_k)
    # global_step represents the number of batches used for updates
    self.global_step = 0
    self.debug = {}
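# With two staging areas, both put ops can be grouped into one session call
# (an assumed usage sketch; `batch` and `bias` stand for the sampled
# transition arrays and the bias-correction weights):
#
#   feed = dict(zip(self.buffer_ph_tf, batch))
#   feed.update(dict(zip(self.buffer_ph_tf_new, [bias])))
#   self.sess.run([self.stage_op, self.stage_op_new], feed_dict=feed)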
def __init__(self, *, input_dims, size_ensemble, use_Q, use_double_network,
             buffer_size, hidden, layers, batch_size, lr, norm_eps, norm_clip,
             polyak, max_u, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, reuse=False, **kwargs):
    """Implementation of a value-function ensemble.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        size_ensemble (int): number of value functions in the ensemble
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        batch_size (int): batch size for training
        lr (float): learning rate for the Q (critic) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped in the Bellman update
        inference_clip_pos_returns (boolean): whether or not the value output used for disagreement should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    if self.use_double_network:
        self.use_Q = True
        self.create_v_function = DoubleQFunction
    elif self.use_Q:
        self.create_v_function = QFunction
    else:
        self.create_v_function = VFunction

    if self.clip_return is None:
        self.clip_return = np.inf
    # self.inference_clip_range = (-self.clip_return,
    #                              0. if inference_clip_pos_returns else self.clip_return)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    if self.use_Q:
        stage_shapes['u_2'] = stage_shapes['u']
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network: one staging area and put op per ensemble member.
    with tf.variable_scope(self.scope):
        self.staging_tf = [None] * self.size_ensemble
        self.stage_ops = [None] * self.size_ensemble
        self.buffer_ph_tf = []
        for e in range(self.size_ensemble):
            staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            stage_op = staging_tf.put(buffer_ph_tf)
            # store in attribute lists
            self.staging_tf[e] = staging_tf
            self.buffer_ph_tf.extend(buffer_ph_tf)
            self.stage_ops[e] = stage_op

        if self.use_double_network:
            self._create_double_network(reuse=reuse)
        else:
            self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['ag'] = (self.T, self.dimg)
    # if self.use_Q:
    #     buffer_shapes['u_2'] = (self.T - 1, self.dimu)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
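# Staging one sampled batch into every ensemble member: buffer_ph_tf above
# concatenates the placeholders of all members, so the batch list is tiled
# size_ensemble times (an assumed usage sketch):
#
#   batch = self.sample_batch()  # list ordered like self.stage_shapes
#   feed = dict(zip(self.buffer_ph_tf, batch * self.size_ensemble))
#   self.sess.run(self.stage_ops, feed_dict=feed)  # one put op per member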
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used (e.g. 'GHER.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    # # print("\n\n\n\n1--", input_dims, "\n2--", buffer_size, "\n3--", hidden,
    # #       "\n4--", layers, "\n5--", network_class, "\n6--", polyak, "\n7--", batch_size,
    # #       "\n8--", Q_lr, "\n9--", pi_lr, "\n10--", norm_eps, "\n11--", norm_clip,
    # #       "\n12--", max_u, "\n13--", action_l2, "\n14--", clip_obs, "\n15--", scope, "\n16--", T,
    # #       "\n17--", rollout_batch_size, "\n18--", subtract_goals, "\n19--", relative_goals,
    # #       "\n20--", clip_pos_returns, "\n21--", clip_return,
    # #       "\n22--", sample_transitions, "\n23--", gamma)
    """ Example of parameter values in the FetchReach-v1 run:
        input_dims (dict of ints): {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1} (o, u, and g are all network inputs)
        buffer_size (int): 1E6 (total number of samples in the experience pool)
        hidden (int): 256 (number of hidden-layer neurons)
        layers (int): 3 (three-layer neural network)
        network_class (str): 'GHER.ActorCritic'
        polyak (float): 0.95 (smoothing parameter for target-network updates)
        batch_size (int): 256 (batch size)
        Q_lr (float): 0.001 (learning rate)
        pi_lr (float): 0.001 (learning rate)
        norm_eps (float): 0.01 (used to avoid numerical overflow)
        norm_clip (float): 5 (norm_clip)
        max_u (float): 1.0 (the action range is [-1.0, 1.0])
        action_l2 (float): 1.0 (regularization coefficient in the actor-network loss)
        clip_obs (float): 200 (obs is limited to (-200, +200))
        scope (str): "ddpg" (scope name used by TensorFlow)
        T (int): 50 (number of interactions per episode)
        rollout_batch_size (int): 2 (number of parallel rollouts per DDPG agent)
        subtract_goals (function): preprocesses the goal; with inputs a and b, outputs a - b
        relative_goals (boolean): False (True if goals need to be processed by subtract_goals)
        clip_pos_returns (boolean): True (whether positive returns should be clipped)
        clip_return (float): 50 (limit returns to [-clip_return, clip_return])
        sample_transitions (function): the function returned by HER; its parameters are defined in config.py
        gamma (float): 0.98 (discount factor used in the Q-network update)

        sample_transitions comes from the HER definition and is a key component.
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    # The network structure and computation graph are created in actor_critic.py
    self.create_actor_critic = import_function(self.network_class)

    # Extract dimensions
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']  # 10
    self.dimg = self.input_dims['g']  # 3
    self.dimu = self.input_dims['u']  # 4
    # print("+++", input_shapes)
    # {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)}

    # https://www.tensorflow.org/performance/performance_models
    # StagingArea provides simple functionality and can run in parallel with
    # other stages on CPU and GPU; splitting the input pipeline into separate
    # parallel operations scales to large multi-core environments.
    # Define the required storage variables. Suppose self.dimo=10, self.dimg=5,
    # self.dimu=5; then stage_shapes={'o': (None, 10), 'g': (None, 5), 'u': (None, 5)},
    # plus the variables used by the target network:
    # {'o_2': (None, 10), 'g_2': (None, 5)}

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)  # scalar reward
    self.stage_shapes = stage_shapes
    # After execution, self.stage_shapes =
    # OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)),
    #              ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))]),
    # i.e. g, o, u, the target-network inputs o_2 and g_2, and the reward r.

    # Create network.
    # Create tf placeholders matching stage_shapes, i.e. g, o, u, o_2, g_2, r:
    # self.buffer_ph_tf = [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>,
    #                      <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>]
    with tf.variable_scope(self.scope):
        # Create the StagingArea
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        # Create TensorFlow placeholders
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        # Tie the placeholders to the StagingArea
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Replay-buffer bookkeeping.
    # With T = 50, buffer_shapes ends up as
    # {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'info_is_success': (50, 1), 'ag': (51, 3)}.
    # u, g, etc. record the T samples of a full episode, while o (and ag)
    # need one extra slot for the final observation.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    # buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    # buffer_shapes['ag'] = (self.T + 1, self.dimg)
    # print("+++", buffer_shapes)

    # buffer_size is measured in samples;
    # self.buffer_size=1E6, self.rollout_batch_size=2, buffer_size=1E6
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr,
             norm_eps, norm_clip, max_u, action_l2, clip_obs, scope,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             gamma, vloss_type='normal', priority=False, reuse=False,
             **kwargs):
    """
    buffer (object): buffer to save transitions
    input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
    hidden (int): number of units in the hidden layers
    layers (int): number of hidden layers
    polyak (float): coefficient for Polyak-averaging of the target network
    Q_lr (float): learning rate for the Q (critic) network
    pi_lr (float): learning rate for the pi (actor) network
    norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
    norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
    max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
    action_l2 (float): coefficient for L2 penalty on the actions
    clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
    scope (str): the scope used for the TensorFlow graph
    subtract_goals (function): function that subtracts goals from each other
    relative_goals (boolean): whether or not relative goals should be fed into the network
    clip_pos_returns (boolean): whether or not positive returns should be clipped
    clip_return (float): clip returns to be in [-clip_return, clip_return]
    gamma (float): gamma used for Q learning updates
    vloss_type (str): value loss type: 'normal', 'tf_gamma', or 'target'
    priority (boolean): whether or not to use prioritization
    reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf
    self.dimo, self.dimg, self.dimu = (self.input_dims['o'],
                                       self.input_dims['g'],
                                       self.input_dims['u'])
    self.stage_shapes = self.get_stage_shapes()
    self.init_target_net_op = None
    self.update_target_net_op = None

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)
    logger.log('value loss type: {}'.format(self.vloss_type))
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight,
             aux_loss_weight, sample_transitions, gamma, reuse=False,
             use_seperate_networks=False, **kwargs):
    if self.clip_return is None:
        self.clip_return = np.inf

    if use_seperate_networks:
        self.create_naf_network = import_function(
            "her.naf_utils.naf_network_seperate:Network")
    else:
        self.create_naf_network = import_function(
            "her.naf_utils.naf_network_shared:Network")

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']
    self.counter = 0

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size //
                   self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)

    global DEMO_BUFFER
    # initialize the demo buffer, in the same way as the primary data buffer
    DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
def __init__(self, input_dims, buffer_size, hidden, layers,
             network_class_actor_critic, network_class_discriminator, polyak,
             batch_size, Q_lr, pi_lr, mi_lr, sk_lr, r_scale, mi_r_scale,
             sk_r_scale, et_r_scale, norm_eps, norm_clip, max_u, action_l2,
             clip_obs, scope, T, rollout_batch_size, subtract_goals,
             relative_goals, clip_pos_returns, clip_return, sample_transitions,
             gamma, env_name, max_timesteps, pretrain_weights, finetune_pi,
             mi_prioritization, sac, reuse=False, history_len=10000, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight
    Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g),
            and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class_actor_critic (str) / network_class_discriminator (str): the
            network classes that should be used (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused

    (Variant-specific parameters such as mi_lr, sk_lr, the reward scales,
    pretrain_weights, finetune_pi, mi_prioritization, sac, and history_len are
    undocumented in the original.)
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class_actor_critic)
    self.create_discriminator = import_function(self.network_class_discriminator)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimz = self.input_dims['z']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']
    self.env_name = env_name

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    # Extra per-batch entries (w, m, s) and scalar weights (m_w, s_w, r_w, e_w)
    # used by this variant.
    stage_shapes['w'] = (None,)
    stage_shapes['m'] = (None,)
    stage_shapes['s'] = (None,)
    stage_shapes['m_w'] = ()
    stage_shapes['s_w'] = ()
    stage_shapes['r_w'] = ()
    stage_shapes['e_w'] = ()
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(pretrain_weights, mi_prioritization, reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions, mi_prioritization)

    self.mi_r_history = deque(maxlen=history_len)
    self.gl_r_history = deque(maxlen=history_len)
    self.sk_r_history = deque(maxlen=history_len)
    self.et_r_history = deque(maxlen=history_len)
    self.mi_current = 0
    self.finetune_pi = finetune_pi
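# Two bookkeeping conventions recur in the replay-buffer setup above:
# observations get one extra slot per episode ('o' is stored for T + 1 steps
# here, while other variants store T - 1 for everything but 'o'), and the
# buffer size is rounded down to a whole number of parallel rollouts. A toy
# check with illustrative numbers:
T, rollout_batch_size, buffer_size = 50, 2, 1005
input_shapes = {'o': (10,), 'u': (4,), 'g': (3,)}

buffer_shapes = {key: (T if key != 'o' else T + 1, *shape)
                 for key, shape in input_shapes.items()}
# -> {'o': (51, 10), 'u': (50, 4), 'g': (50, 3)}

# Round down so the buffer holds a whole number of rollout batches.
buffer_size = (buffer_size // rollout_batch_size) * rollout_batch_size  # 1004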
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, time_horizon, rollout_batch_size,
             subtract_goals, relative_goals, clip_pos_returns, clip_return,
             sample_transitions, gamma, reuse=False):
    """
    Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g),
        and the actions (u)
    :param buffer_size: (int) number of transitions that are stored in the replay buffer
    :param hidden: (int) number of units in the hidden layers
    :param layers: (int) number of hidden layers
    :param network_class: (str) the network class that should be used
        (e.g. 'baselines.her.ActorCritic')
    :param polyak: (float) coefficient for Polyak-averaging of the target network
    :param batch_size: (int) batch size for training
    :param q_lr: (float) learning rate for the Q (critic) network
    :param pi_lr: (float) learning rate for the pi (actor) network
    :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities
    :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip]
    :param max_u: (float) maximum action magnitude, i.e. actions are in [-max_u, max_u]
    :param action_l2: (float) coefficient for L2 penalty on the actions
    :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs]
    :param scope: (str) the scope used for the TensorFlow graph
    :param time_horizon: (int) the time horizon for rollouts
    :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent
    :param subtract_goals: (function (numpy Number, numpy Number): numpy Number)
        function that subtracts goals from each other
    :param relative_goals: (boolean) whether or not relative goals should be fed into the network
    :param clip_pos_returns: (boolean) whether or not positive returns should be clipped
    :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
    :param sample_transitions: (function (dict, int): dict) function that samples
        from the replay buffer
    :param gamma: (float) gamma used for Q learning updates
    :param reuse: (boolean) whether or not the networks should be reused
    """
    # Updated in experiments/config.py
    self.input_dims = input_dims
    self.buffer_size = buffer_size
    self.hidden = hidden
    self.layers = layers
    self.network_class = network_class
    self.polyak = polyak
    self.batch_size = batch_size
    self.q_lr = q_lr
    self.pi_lr = pi_lr
    self.norm_eps = norm_eps
    self.norm_clip = norm_clip
    self.max_u = max_u
    self.action_l2 = action_l2
    self.clip_obs = clip_obs
    self.scope = scope
    self.time_horizon = time_horizon
    self.rollout_batch_size = rollout_batch_size
    self.subtract_goals = subtract_goals
    self.relative_goals = relative_goals
    self.clip_pos_returns = clip_pos_returns
    self.clip_return = clip_return
    self.sample_transitions = sample_transitions
    self.gamma = gamma
    self.reuse = reuse

    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)
    self.dim_obs = self.input_dims['o']
    self.dim_goal = self.input_dims['g']
    self.dim_action = self.input_dims['u']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.time_horizon if key != 'o' else self.time_horizon + 1,
              *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal)
    buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.time_horizon,
                               self.sample_transitions)
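# To make the staging-shape construction concrete, here is what the loop
# produces for toy dimensions; note the resulting key order
# ('g', 'o', 'u', 'o_2', 'g_2', 'r'), which also matters when zipping the
# placeholders with sampled batches later:
from collections import OrderedDict

input_dims = {'o': 10, 'g': 3, 'u': 4, 'info_is_success': 1}
input_shapes = {k: (v,) for k, v in input_dims.items()}  # simplified dims_to_shapes

stage_shapes = OrderedDict()
for key in sorted(input_dims.keys()):
    if key.startswith('info_'):  # info_* keys are never staged
        continue
    stage_shapes[key] = (None, *input_shapes[key])
for key in ['o', 'g']:
    stage_shapes[key + '_2'] = stage_shapes[key]
stage_shapes['r'] = (None,)

print(list(stage_shapes.items()))
# [('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)),
#  ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))]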
def __init__(self, input_dims, hidden, layers, network_class, polyak,
             batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2,
             clip_obs, scope, T, rollout_batch_size, subtract_goals,
             relative_goals, clip_pos_returns, clip_return, normalize_obs,
             sample_transitions, gamma, buffers=None, reuse=False,
             tasks_ag_id=None, tasks_g_id=None, task_replay='', t_id=None,
             eps_task=None, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight
    Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g),
            and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used
            (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        buffers (list): buffers used to store new transitions (usually one per task + 1)
        tasks_ag_id (list): indices locating each task's achieved goals in the
            achieved-goal vector
        tasks_g_id (list): indices locating each task's goals in the goal vector
        task_replay (str): defines the task replay strategy (see train.py for info)
        t_id (int): index of the task corresponding to this policy when using a
            task-experts structure
        eps_task (float): epsilon parameter for the epsilon-greedy strategy (task choice)
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    self.normalize_obs = normalize_obs
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimag = self.input_dims['ag']
    self.dimu = self.input_dims['u']
    if self.structure == 'curious' or self.structure == 'task_experts':
        self.dimtd = self.input_dims['task_descr']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None, 1)
    self.stage_shapes = stage_shapes

    if t_id is not None:
        self.scope += str(t_id)

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Additions for multi-task structures.
    if self.structure == 'curious' or self.structure == 'task_experts':
        self.tasks_g_id = tasks_g_id
        self.tasks_ag_id = tasks_ag_id
        self.nb_tasks = len(tasks_g_id)
    if buffers is not None:
        self.buffer = buffers
        if type(self.buffer) is list:
            if len(self.buffer) > 5:
                # Distractor buffers are all equal, so alias indices 6+ to buffer 5.
                for i in range(6, len(self.buffer)):
                    self.buffer[i] = self.buffer[5]
    self.first = True
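# eps_task is documented only as the epsilon of an epsilon-greedy task
# choice; the selection rule itself lives elsewhere in the codebase. A
# hypothetical sketch of what such a picker typically looks like
# (choose_task, task_scores, and rng are illustrative names, not
# identifiers from this code):
import numpy as np

def choose_task(nb_tasks, task_scores, eps_task, rng=np.random):
    # With probability eps_task explore a uniformly random task,
    # otherwise exploit the highest-scoring task.
    if rng.uniform() < eps_task:
        return rng.randint(nb_tasks)
    return int(np.argmax(task_scores))

# Example: choose_task(4, [0.1, 0.7, 0.2, 0.0], eps_task=0.2)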
def __init__(self, FLAGS, input_dims, buffer_size, hidden, layers,
             network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps,
             norm_clip, max_u, action_l2, clip_obs, scope, T,
             rollout_batch_size, subtract_goals, relative_goals,
             clip_pos_returns, clip_return, bc_loss, q_filter, num_demo,
             demo_batch_size, prm_loss_weight, aux_loss_weight,
             # original signature: sample_transitions, gamma, reuse=False, **kwargs
             sample_transitions, gamma, td3_policy_freq, td3_policy_noise,
             td3_noise_clip, reuse=False, *agent_params, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight
    Experience Replay (HER). Adds the ability to train from demonstrations
    to overcome the exploration problem.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g),
            and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used
            (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        q_filter: whether or not a filter on the Q value update should be used when
            training with demonstrations
        num_demo: number of episodes to be used in the demonstration buffer
        demo_batch_size: number of samples to be used from the demonstrations buffer,
            per MPI thread
        prm_loss_weight: weight corresponding to the primary loss
        aux_loss_weight: weight corresponding to the auxiliary loss, also called the
            cloning loss
        agent_params: HAC agent parameters
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    # self.dimo1 = self.input_dims['o1']  # A.R: added for TD3 (has obs0, obs1)
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    # Added: parameters for using the TD3 variant of DDPG
    # (https://arxiv.org/abs/1802.09477).
    self.td3_policy_freq = td3_policy_freq
    self.td3_policy_noise = td3_policy_noise
    self.td3_noise_clip = td3_noise_clip

    # For HAC.
    self.FLAGS = FLAGS

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:  # A.R variant: ['o', 'o1', 'g']
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    # A.R variant: (self.T - 1 if key != 'o' and key != 'o1' else self.T, ...)
    buffer_shapes = {key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)

    global DEMO_BUFFER
    # Initialize the demo buffer in the same way as the primary data buffer.
    DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
    print("@ ddpg.py, buffer={}".format(self.buffer))
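# td3_policy_freq, td3_policy_noise, and td3_noise_clip are stored above but
# consumed inside _create_network, which is not shown here. For reference,
# the standard TD3 target-policy smoothing they parameterize (Fujimoto et
# al., 2018, https://arxiv.org/abs/1802.09477) looks like the sketch below;
# the function name and NumPy formulation are illustrative, not this repo's code.
import numpy as np

def smoothed_target_action(pi_target, max_u, policy_noise, noise_clip,
                           rng=np.random):
    # Add clipped Gaussian noise to the target action, then clip the
    # result back into the valid action range [-max_u, max_u].
    noise = np.clip(rng.normal(0.0, policy_noise, size=pi_target.shape),
                    -noise_clip, noise_clip)
    return np.clip(pi_target + noise, -max_u, max_u)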
def __init__(self, env_spec, task_spec, buffer_size, network_params,
             normalizer_params, polyak, batch_size, Q_lr, pi_lr, max_u,
             action_l2, clip_obs, scope, random_eps, noise_eps, train_steps,
             relative_goals, clip_pos_returns, clip_return, replay_strategy,
             replay_k, noise_type, share_experience, noise_adaptation,
             reuse=False):
    """Implementation of DDPG that is used in combination with Hindsight
    Experience Replay (HER).

    Note: the original docstring was inherited from the baselines DDPG class;
    only the entries below match this constructor. The variant-specific
    parameters (env_spec, task_spec, network_params, normalizer_params,
    random_eps, noise_eps, train_steps, replay_strategy, replay_k, noise_type,
    share_experience, noise_adaptation) are undocumented in the original.

    Args:
        buffer_size (int): number of transitions that are stored in the replay buffer
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        reuse (boolean): whether or not the networks should be reused
    """
    super().__init__(scope)
    self.replay_k = replay_k
    self.replay_strategy = replay_strategy
    self.clip_pos_returns = clip_pos_returns
    self.relative_goals = relative_goals
    self.train_steps = train_steps
    self.noise_eps = noise_eps
    self.random_eps = random_eps
    self.clip_obs = clip_obs
    self.action_l2 = action_l2
    self.max_u = max_u
    self.pi_lr = pi_lr
    self.Q_lr = Q_lr
    self.batch_size = batch_size
    self.normalizer_params = normalizer_params
    self.polyak = polyak
    self.buffer_size = buffer_size
    self._env_spec = env_spec
    self._T = self._env_spec['T']
    self.network_params = network_params
    self._share_experience = share_experience
    self._noise_adaptation = noise_adaptation
    self._task_spec = deepcopy(task_spec)
    self._task_spec['buffer_size'] = 0
    self._task = Task(**self._task_spec)

    # Horizon-tied discount: gamma = 1 - 1/T.
    self._gamma = 1. - 1. / self._T
    self.clip_return = (1. / (1. - self._gamma)) if clip_return else np.inf
    if self.clip_return is None:  # kept from the original; unreachable after the line above
        self.clip_return = np.inf

    self.create_actor_critic = import_function(network_params['net_type'])
    self.input_dims = dict(
        o=self._env_spec['o_dim'],
        a=self._env_spec['a_dim'],
        g=self._task_spec['g_dim'],
    )
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self._env_spec['o_dim']
    self.dimg = self._task_spec['g_dim']
    self.dima = self._env_spec['a_dim']

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_next'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    self._action_noise, self._parameter_noise = get_noise_from_string(
        self._env_spec, noise_type)

    # Create network.
    with tf.variable_scope(self._scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    buffer_shapes = dict()
    buffer_shapes['o'] = (self.dimo,)
    buffer_shapes['o_next'] = buffer_shapes['o']
    buffer_shapes['g'] = (self.dimg,)
    buffer_shapes['ag'] = (self.dimg,)
    buffer_shapes['ag_next'] = (self.dimg,)
    buffer_shapes['a'] = (self.dima,)
    self.sample_transitions = make_sample_her_transitions(
        self.replay_strategy, self.replay_k, self._task.reward_done_success)
    self._buffer = ReplayBuffer(buffer_shapes, self.buffer_size, self._T,
                                self.sample_transitions)
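# The discount above is tied to the horizon: with gamma = 1 - 1/T, the clip
# value 1/(1 - gamma) works out to exactly T, i.e. the largest possible
# undiscounted return when per-step rewards are bounded by 1. A quick check:
T = 50
gamma = 1.0 - 1.0 / T            # 0.98
clip_return = 1.0 / (1.0 - gamma)
assert abs(clip_return - T) < 1e-9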
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals,
             relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter,
             num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight,
             sample_transitions, gamma, reuse=False, pre_train_model=False,
             update_model=True, feature_net_path='', **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight
    Experience Replay (HER). Adds the ability to train from demonstrations
    to overcome the exploration problem.

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g),
            and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used
            (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
        bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
        q_filter: whether or not a filter on the Q value update should be used when
            training with demonstrations
        num_demo: number of episodes to be used in the demonstration buffer
        demo_batch_size: number of samples to be used from the demonstrations buffer,
            per MPI thread
        prm_loss_weight: weight corresponding to the primary loss
        aux_loss_weight: weight corresponding to the auxiliary loss, also called the
            cloning loss
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    # ADDED
    self.pre_train_model = pre_train_model
    self.feature_net_path = feature_net_path
    self.process_type = kwargs['process_type']
    self.contact_dim = kwargs['contact_dim']
    # contact_dim must be assigned before use_contact is derived from it.
    self.use_contact = (self.contact_dim > 0)
    self.__dict__['use_contact'] = self.use_contact
    self.__dict__['pre_train'] = self.pre_train_model

    self.create_actor_critic = import_function(self.network_class)
    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o'] - self.contact_dim
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']
    self.feature_dim = kwargs['feature_dim']
    self.contact_point_dim = self.contact_dim // self.fixed_num_of_contact

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        logger.info("Creating a DDPG agent with action space %d x %s..."
                    % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        # Staging order: ['g', 'o', 'u', 'o_2', 'g_2', 'r'].
        if self.pre_train_model == 'cpc':
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            # A second staging area for the CPC positive/negative observations.
            self.cpc_shape = OrderedDict()
            self.cpc_shape['obs_neg'] = (None, self.fixed_num_of_contact,
                                         self.contact_point_dim)
            self.cpc_shape['obs_pos'] = (None, self.fixed_num_of_contact,
                                         self.contact_point_dim)
            self.cpc_staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.cpc_shape.keys()],
                shapes=list(self.cpc_shape.values()))
            self.cpc_buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.cpc_shape.values()
            ]
            self.cpc_stage_op = self.cpc_staging_tf.put(self.cpc_buffer_ph_tf)
        else:
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self.update_model = update_model
        if self.pre_train_model != 'none':
            self.__dict__['feature_net_path'] = self.feature_net_path
            self.__dict__['clip_obs'] = self.clip_obs
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
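# At train time the placeholders above are fed and stage_op is run once per
# batch. In baselines-style HER/DDPG this is done by a stage_batch helper;
# a sketch under that assumption (sample_batch is an assumed helper that
# returns a list of arrays ordered like self.stage_shapes):
def stage_batch(self, batch=None):
    if batch is None:
        batch = self.sample_batch()
    assert len(self.buffer_ph_tf) == len(batch)
    self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch)))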
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals,
             relative_goals, clip_pos_returns, clip_return, sample_transitions,
             gamma, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight
    Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g),
            and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used
            (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        reuse (boolean): whether or not the networks should be reused
    """
    self.ep_ctr = 0
    self.hist_bins = 50
    self.draw_hist_freq = 3
    self._reset_hists()
    self.shared_pi_err_coeff = kwargs['shared_pi_err_coeff']

    HRL_Policy.__init__(self, input_dims, T, rollout_batch_size, **kwargs)

    self.hidden = hidden
    self.layers = layers
    self.max_u = max_u
    self.network_class = network_class
    self.sample_transitions = sample_transitions
    self.scope = scope
    self.subtract_goals = subtract_goals
    self.relative_goals = relative_goals
    self.clip_obs = clip_obs
    self.Q_lr = Q_lr
    self.pi_lr = pi_lr
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.clip_pos_returns = clip_pos_returns
    self.gamma = gamma
    self.polyak = polyak
    self.clip_return = clip_return
    self.norm_eps = norm_eps
    self.norm_clip = norm_clip
    self.action_l2 = action_l2

    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)
    # stage_shapes is assumed to be prepared by HRL_Policy; add a per-sample
    # gamma slot on top of it.
    self.stage_shapes['gamma'] = (None,)

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *self.input_shapes[key])
        for key, val in self.input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_shapes['p'] = (buffer_shapes['g'][0], 1)
    buffer_shapes['steps'] = buffer_shapes['p']
    # Originally rounded down: (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
    buffer_size = self.buffer_size
    self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                               self.sample_transitions)
    self.preproc_lr = (self.Q_lr + self.pi_lr) / 2
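# For completeness, the consumer side of all these staging areas: inside
# _create_network, baselines-style code unstages with get() and rebuilds a
# name-keyed batch dict. A sketch of that convention (not code from this
# file); it assumes the same key order as self.stage_shapes:
from collections import OrderedDict
import tensorflow as tf

def unstage_batch(staging_tf, stage_shapes):
    batch = staging_tf.get()  # list of tensors in staging order
    batch_tf = OrderedDict([(key, batch[i])
                            for i, key in enumerate(stage_shapes.keys())])
    # Rewards are staged flat; reshape to a column for loss computations.
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
    return batch_tf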