Example #1
 def setup_staging_areas(self):
     for idx, device in enumerate(self._devices):
         with tf.device(device):
             inputs = self._input.get_input_tensors()
             dtypes = [x.dtype for x in inputs]
             stage = StagingArea(dtypes, shapes=None)
             self._stage_ops.append(stage.put(inputs))
             self._areas.append(stage)
             outputs = stage.get()
             for vin, vout in zip(inputs, outputs):
                 vout.set_shape(vin.get_shape())
             self._unstage_ops.append(outputs)
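
The method above only builds the stage/unstage ops. A minimal sketch of how such ops are usually driven (assuming TF 1.x, with toy tensors standing in for tensorpack's real input pipeline): pre-fill the StagingArea once, then run the put op together with the training op so the next batch is staged while the current one is consumed.

import tensorflow as tf
from tensorflow.python.ops.data_flow_ops import StagingArea

# Toy stand-ins for a real input pipeline and training op.
x = tf.random_normal([32, 10])
y = tf.random_uniform([32], maxval=5, dtype=tf.int32)
area = StagingArea(dtypes=[tf.float32, tf.int32], shapes=[[32, 10], [32]])
stage_op = area.put([x, y])
staged_x, staged_y = area.get()
train_op = tf.reduce_mean(staged_x)  # stand-in for a real training op

with tf.Session() as sess:
    sess.run(stage_op)  # warm-up: pre-fill the staging area once
    for _ in range(5):
        # consume the staged batch while staging the next one in the same run call
        sess.run([train_op, stage_op])
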
Example #2
 def stage_data(self, batch, memory_gb=1, n_threads=4):
     """Stage `batch` on /gpu:0 via a StagingArea filled by background QueueRunner threads."""
     with tf.device('/gpu:0'):
         dtypes = [t.dtype for t in batch]
         shapes = [t.get_shape() for t in batch]
         SA = StagingArea(dtypes,
                          shapes=shapes,
                          memory_limit=memory_gb * 1e9)
         get, put, clear = SA.get(), SA.put(batch), SA.clear()
     tf.train.add_queue_runner(
         tf.train.QueueRunner(queue=SA,
                              enqueue_ops=[put] * n_threads,
                              close_op=clear,
                              cancel_op=clear))
     return get
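
A hedged usage note, not from the original source: because the put ops are only registered with tf.train.add_queue_runner, nothing is staged until the runners are started. A hypothetical driver (with `pipeline`, `batch`, and `train_op` as placeholder names) would look roughly like this:

import tensorflow as tf

# staged = pipeline.stage_data(batch, memory_gb=1, n_threads=4)  # hypothetical call
# train_op = build_train_op(staged)                              # hypothetical model

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        pass  # per step: sess.run(train_op), which pulls from the StagingArea via `get`
    finally:
        coord.request_stop()
        coord.join(threads)
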
Example #3
 def setup_staging_areas(self):
     for idx, device in enumerate(self._devices):
         with tf.device(device):
             inputs = self._input.get_input_tensors()
             dtypes = [x.dtype for x in inputs]
             stage = StagingArea(dtypes, shapes=None)
             self._stage_ops.append(stage.put(inputs))
             self._areas.append(stage)
             outputs = stage.get()
             if isinstance(
                     outputs,
                     tf.Tensor):  # when size=1, TF doesn't return a list
                 outputs = [outputs]
             for vin, vout in zip(inputs, outputs):
                 vout.set_shape(vin.get_shape())
             self._unstage_ops.append(outputs)
Example #4
    def _prepare_staging(self):
        with tf.variable_scope('staging', reuse=tf.AUTO_REUSE):
            staging_area_tf = StagingArea(
                dtypes=[tf.float32 for _ in self._stage_shapes.keys()],
                shapes=[(None, *shape)
                        for shape in self._stage_shapes.values()])
            input_ph_tf = [
                tf.placeholder(tf.float32, shape=(None, *shape))
                for shape in self._stage_shapes.values()
            ]
            staging_op_tf = staging_area_tf.put(input_ph_tf)

            batch_tf = OrderedDict([
                (key, batch_item) for key, batch_item in zip(
                    self._stage_shapes.keys(), staging_area_tf.get())
            ])

        return staging_area_tf, input_ph_tf, staging_op_tf, batch_tf
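
A self-contained sketch (assuming TF 1.x; the shapes below are made up rather than taken from `self._stage_shapes`) of how the returned handles are typically used: feed numpy arrays into the placeholders, run the staging op to push them into the StagingArea, then evaluate anything built on the batch tensors.

from collections import OrderedDict

import numpy as np
import tensorflow as tf
from tensorflow.python.ops.data_flow_ops import StagingArea

stage_shapes = OrderedDict([('o', (10,)), ('u', (4,))])  # assumed example shapes
area = StagingArea(dtypes=[tf.float32] * len(stage_shapes),
                   shapes=[(None, *s) for s in stage_shapes.values()])
input_ph = [tf.placeholder(tf.float32, shape=(None, *s))
            for s in stage_shapes.values()]
stage_op = area.put(input_ph)
batch = OrderedDict(zip(stage_shapes.keys(), area.get()))

with tf.Session() as sess:
    feed = {ph: np.zeros((8,) + s, dtype=np.float32)
            for ph, s in zip(input_ph, stage_shapes.values())}
    sess.run(stage_op, feed_dict=feed)  # stage one batch
    print(sess.run(batch['o']).shape)   # (8, 10)
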
Example #5
def train_qdqn(config, log_dir, make_env, model, cleanup=False):
    if cleanup:
        shutil.rmtree(log_dir, ignore_errors=True)

    np.random.seed(42)
    tf.set_random_seed(7)

    env = make_env(666)
    observation_space = env.observation_space
    action_space = env.action_space
    env.close()

    actor_queue = tf.FIFOQueue(capacity=config.queue_capacity,
            dtypes=[tf.uint8, tf.int32, tf.float32, tf.uint8, tf.float32, tf.int32],
            shapes=[observation_space.shape, action_space.shape, [], observation_space.shape, [], []])

    batch_shape = [config.batch_size]
    learner_queue = StagingArea(
            dtypes=[tf.uint8, tf.int32, tf.float32, tf.uint8, tf.float32],
            shapes=[
                batch_shape + list(observation_space.shape),
                batch_shape + list(action_space.shape),
                batch_shape,
                batch_shape + list(observation_space.shape),
                batch_shape],
            memory_limit=2**30)

    coord = tf.train.Coordinator()

    workers = []
    learner = Learner(
            learner_dir(log_dir),
            observation_space,
            action_space,
            model,
            learner_queue,
            config,
            create_learner_logger(log_dir))

    trainer = Trainer(config, actor_queue, learner_queue, observation_space, action_space,
            create_trainer_logger(log_dir))
    workers.append(trainer)

    for i in range(config.actor_count):
        workers.append(Actor(
            i,
            i == 0,
            make_env(i),
            model,
            actor_queue,
            config,
            create_actor_logger(log_dir, i),
            create_json_logger(os.path.join(actor_dir(log_dir, i), 'episodes')),
            should_render=False,))

    with U.make_session(config.tf_thread_count) as session:
        U.initialize(session=session)

        learner.load(learner_dir(log_dir), session=session)

        threads = []
        for worker in workers:
            # bind the current worker as a default argument to avoid the
            # late-binding closure bug (all lambdas would otherwise share `worker`)
            thread = threading.Thread(target=lambda w=worker: w.run(session, coord))
            thread.start()
            threads.append(thread)

        learner.run(session, coord)
        actor_queue.close()

        try:
            coord.join(threads, stop_grace_period_secs=10)
        except RuntimeError as e:
            print("Failed to join threads: {}".format(e))
Example #6
    def __init__(self,
                 use_aux_tasks,
                 input_dims,
                 image_input_shapes,
                 buffer_size,
                 hidden,
                 layers,
                 dim_latent_repr,
                 cnn_nonlinear,
                 use_bottleneck_layer,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 scope,
                 T,
                 rollout_batch_size,
                 clip_pos_returns,
                 clip_return,
                 log_loss,
                 sample_transitions,
                 gamma,
                 rank,
                 serialized=False,
                 reuse=False,
                 clip_grad_range=None,
                 aux_filter_interval=None,
                 scale_grad_by_procs=False,
                 aux_update_interval=5,
                 aux_base_lr=5,
                 **kwargs):
        """ See the documentation in main.py """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(
            'cnn_actor_critic:CNNActorCritic')

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        if self.use_aux_tasks:
            self.dim_bw_frame = self.input_dims['info_bw_frame']
            self.dim_op_flow = self.input_dims['info_op_flow']
            self.dim_transformed_frame = self.input_dims[
                'info_transformed_frame']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()

        include_info = [
            'info_state_obs', 'info_transformed_frame', 'info_transformation',
            'info_op_flow', 'info_bw_frame'
        ]

        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_') and key not in include_info:
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            if self.use_aux_tasks:
                # Initialize OL-AUX
                self.num_auxiliary_tasks = 5
                self.aux_weights_lr = self.aux_base_lr * self.aux_update_interval

                self.aux_weight_vector_Q_tf = tf.Variable(
                    initial_value=1 * tf.ones(self.num_auxiliary_tasks),
                    dtype=tf.float32,
                    name='aux_weights')
                self.aux_weight_grads_buffer = []
                self.log_aux_losses_Q = self.log_aux_tasks_losses_pi = None  # Logging buffer for aux losses
                if self.aux_filter_interval is not None:
                    self.all_grad_history = deque(
                        maxlen=self.aux_filter_interval)

            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=self.reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key:
            (self.T if key != 'o' and not key.startswith('info_') else self.T +
             1, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T + 1, self.dimg)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
Example #7
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 bc_loss,
                 q_filter,
                 num_demo,
                 demo_batch_size,
                 prm_loss_weight,
                 aux_loss_weight,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
            Added functionality to use demonstrations for training to Overcome exploration problem.

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
            bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
            q_filter: whether or not a filter on the q value update should be used when training with demonstrations
            num_demo: number of episodes to be used in the demonstration buffer
            demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
            prm_loss_weight: weight corresponding to the primary loss
            aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
        """
        self.input_dims = input_dims
        self.buffer_size = buffer_size
        self.hidden = hidden
        self.layers = layers
        self.network_class = network_class
        self.polyak = polyak
        self.batch_size = batch_size
        self.Q_lr = Q_lr
        self.pi_lr = pi_lr
        self.norm_eps = norm_eps
        self.norm_clip = norm_clip
        self.max_u = max_u
        self.action_l2 = action_l2
        self.clip_obs = clip_obs
        self.scope = scope
        self.T = T
        self.subtract_goals = subtract_goals
        self.relative_goals = relative_goals
        self.clip_pos_returns = clip_pos_returns
        if clip_return is None:
            self.clip_return = np.inf
        else:
            self.clip_return = clip_return
        self.bc_loss = bc_loss
        self.q_filter = q_filter
        self.num_demo = num_demo
        self.demo_batch_size = demo_batch_size
        self.prm_loss_weight = prm_loss_weight
        self.aux_loss_weight = aux_loss_weight
        self.sample_transitions = sample_transitions
        self.gamma = gamma
        self.kwargs = kwargs

        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['ag'] = (self.T, self.dimg)

        self.buffer = ReplayBuffer(buffer_shapes, self.buffer_size, self.T,
                                   self.sample_transitions)

        global DEMO_BUFFER
        # initialize the demo buffer; in the same way as the primary data buffer
        DEMO_BUFFER = ReplayBuffer(buffer_shapes, self.buffer_size, self.T,
                                   self.sample_transitions)
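
Across these DDPG/HER variants the staging machinery is driven the same way: before each training update, a numpy batch ordered like `stage_shapes` is fed into `buffer_ph_tf` and `stage_op` is run, so that the graph built in `_create_network` reads its inputs from the StagingArea. A hedged sketch of that step as a free function (the name `stage_batch` mirrors the baselines-style helper, but the signature here is an assumption):

def stage_batch(sess, stage_op, buffer_ph_tf, batch_values):
    """Push one training batch into the StagingArea.

    batch_values: list of numpy arrays, in the same order as stage_shapes.
    """
    assert len(buffer_ph_tf) == len(batch_values)
    sess.run(stage_op, feed_dict=dict(zip(buffer_ph_tf, batch_values)))
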
Example #8
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'GHER.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """

        # # print("\n\n\n\n1--", input_dims, "\n2--", buffer_size, "\n3--", hidden,
        #         "\n4--", layers, "\n5--", network_class, "\n6--", polyak, "\n7--", batch_size,
        #          "\n8--", Q_lr, "\n9--", pi_lr, "\n10--", norm_eps, "\n11--", norm_clip,
        #          "\n12--", max_u, "\n13--", action_l2, "\n14--", clip_obs, "\n15--", scope, "\n16--", T,
        #          "\n17--", rollout_batch_size, "\n18--", subtract_goals, "\n19--", relative_goals,
        #          "\n20--", clip_pos_returns, "\n21--", clip_return,
        #          "\n22--", sample_transitions, "\n23--", gamma)
        """
         Example of parameter values ​​in the FetchReach-v1 run:
            Input_dims (dict of ints): {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1} (o, u, g are both input to the network)
            Buffer_size (int): 1E6 (total number of experience pool samples)
            Hidden (int): 256 (number of hidden layer neurons)
            Layers (int): 3 (three-layer neural network)
            Network_class (str): GHER.ActorCritic'
            Polyak (float): 0.95 (smooth parameter updated by target-Network)
            Batch_size (int): 256 (bulk size)
            Q_lr (float): 0.001 (learning rate)
            Pi_lr (float): 0.001 (learning rate)
            Norm_eps (float): 0.01 (to avoid data overflow)
            Norm_clip (float): 5 (norm_clip)
            Max_u (float): 1.0 (the range of the action is [-1.0, 1.0])
            Action_l2 (float): 1.0 (loss coefficient of the actor network)
            Clip_obs (float): 200 (obs is limited to (-200, +200))
            Scope (str): "ddpg" (scope named field used by tensorflow)
            T (int): 50 (the number of cycles of interaction)
            Rollout_batch_size (int): 2 (number of parallel rollouts per DDPG agent)
            Subtract_goals (function): A function that preprocesses the goal, with inputs a and b, and output a-b
            Relative_goals (boolean): False (true if the need for function subtract_goals processing for the goal)
            Clip_pos_returns (boolean): True (Do you need to eliminate the positive return)
            Clip_return (float): 50 (limit the range of return to [-clip_return, clip_return])
            Sample_transitions (function): The function returned by her. The parameters are defined by config.py
            Gamma (float): 0.98 (the discount factor used when Q network update)

            Where sample_transition comes from the definition of HER and is a key part
        """

        if self.clip_return is None:
            self.clip_return = np.inf

        # The creation of the network structure and calculation graph is done by the actor_critic.py file
        self.create_actor_critic = import_function(self.network_class)

        # Extract dimension
        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']  # 10
        self.dimg = self.input_dims['g']  # 3
        self.dimu = self.input_dims['u']  # 4
        # print("+++", input_shapes)    #  {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)}

        # https://www.tensorflow.org/performance/performance_models
        # A StagingArea offers simpler functionality than a queue and can be executed in parallel
        # with other stages on both CPU and GPU, splitting the input pipeline into separate
        # parallel operations that scale well on large multi-core machines.

        # Define the required staging variables. Suppose self.dimo=10, self.dimg=3, self.dimu=4.
        # Then stage_shapes = {'o': (None, 10), 'g': (None, 3), 'u': (None, 4)},
        # plus the variables used by the target network: {'o_2': (None, 10), 'g_2': (None, 3)}.
        # Prepare staging area for feeding data to the model.

        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )  # reward is a scalar
        self.stage_shapes = stage_shapes
        # After execution, self.stage_shapes =
        # OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)), ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))])
        # It contains g, o, u, the target-network inputs o_2 and g_2, and the reward r.
        # Create network.
        # Create tf variables based on state_shape, including g, o, u, o_2, g_2, r
        # self.buffer_ph_tf = [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>]
        with tf.variable_scope(self.scope):
            # Create a StagingArea variable
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            # Create a Tensorflow variable placeholder
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            # Connect the placeholders to the StagingArea via its put op
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
            #
            self._create_network(reuse=reuse)

        # Replay buffer setup.
        # When T = 50, after execution, buffer_shapes =
        #         {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'info_is_success': (50, 1), 'ag': (51, 3)}
        # Note that u, g, and info record the T samples of one episode (50 entries), while o and ag
        # hold one extra entry (T + 1) because they include the final observation / achieved goal.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }  #
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)  #
        buffer_shapes['ag'] = (self.T + 1, self.dimg)  #
        # print("+++", buffer_shapes)

        # buffer_size is measured in transitions (samples), rounded down to a multiple of rollout_batch_size.
        # self.buffer_size=1E6  self.rollout_batch_size=2  buffer_size=1E6
        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
Example #9
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 action_scale,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 bc_loss,
                 q_filter,
                 num_demo,
                 demo_batch_size,
                 prm_loss_weight,
                 aux_loss_weight,
                 sample_transitions,
                 gamma,
                 temperature,
                 prioritization,
                 env_name,
                 alpha,
                 beta0,
                 beta_iters,
                 total_timesteps,
                 rank_method,
                 reuse=False,
                 **kwargs):
        """
            Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
            Added functionality to use demonstrations for training to overcome the exploration problem.
        Args:
            :param input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            :param buffer_size (int): number of transitions that are stored in the replay buffer
            :param hidden (int): number of units in the hidden layers
            :param layers (int): number of hidden layers
            :param network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            :param polyak (float): coefficient for Polyak-averaging of the target network
            :param batch_size (int): batch size for training
            :param Q_lr (float): learning rate for the Q (critic) network
            :param pi_lr (float): learning rate for the pi (actor) network
            :param norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            :param norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            :param action_scale(float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            :param action_l2 (float): coefficient for L2 penalty on the actions
            :param clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            :param scope (str): the scope used for the TensorFlow graph
            :param T (int): the time horizon for rollouts
            :param rollout_batch_size (int): number of parallel rollouts per DDPG agent
            :param subtract_goals (function): function that subtracts goals from each other
            :param relative_goals (boolean): whether or not relative goals should be fed into the network
            :param clip_pos_returns (boolean): whether or not positive returns should be clipped
            :param clip_return (float): clip returns to be in [-clip_return, clip_return]
            :param sample_transitions (function): function that samples from the replay buffer
            :param gamma (float): gamma used for Q learning updates
            :param reuse (boolean): whether or not the networks should be reused
            :param bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
            :param q_filter: whether or not a filter on the q value update should be used when training with demonstrations
            :param num_demo: number of episodes to be used in the demonstration buffer
            :param demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
            :param prm_loss_weight: weight corresponding to the primary loss
            :param aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(
            self.network_class)  # points to actor_critic.py

        self.input_dims = input_dims

        input_shapes = dims_to_shapes(input_dims)
        self.dimo = input_dims['o']
        self.dimg = input_dims['g']
        self.dimu = input_dims['u']

        self.sample_count = 1
        self.cycle_count = 1

        self.critic_loss_episode = []
        self.actor_loss_episode = []
        self.critic_loss_avg = []
        self.actor_loss_avg = []

        # Energy based parameters
        self.prioritization = prioritization
        self.env_name = env_name
        self.temperature = temperature
        self.rank_method = rank_method

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)  # Creates DDPG agent

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T, self.dimg)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size

        # print("begin init")
        if self.prioritization == 'energy':
            self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size,
                                             self.T, self.sample_transitions,
                                             self.prioritization,
                                             self.env_name)
        # elif self.prioritization == 'tderror':
        #     self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha)
        #     if beta_iters is None:
        #         beta_iters = total_timesteps
        #     self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0)
        else:
            self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                       self.sample_transitions)
Example #10
    def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size,
                 Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
                 rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return,
                 sample_transitions, gamma, reuse=False, env=None, to_goal=None, nearby_action_penalty=False, nearby_penalty_weight=0,
                 sample_expert=False, expert_batch_size=0., bc_loss=0., anneal_bc=0., terminate_bootstrapping=False, mask_q = False,
                 two_qs=False, anneal_discriminator=False, **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.clip_return is None:
            self.clip_return = np.inf
        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None,)
        if two_qs:
            stage_shapes['r2'] = (None,)
            stage_shapes['w_q2'] = (None, )
        stage_shapes['successes'] = (None,)
        if nearby_action_penalty:
            stage_shapes['far_from_goal'] = (None, )
        if sample_expert:
            stage_shapes['is_demo'] = (None, )
            stage_shapes['annealing_factor'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        # print(self.stage_shapes.keys())
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key])
                         for key, val in input_shapes.items()}
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T+1, self.dimg)
        buffer_shapes['successes'] = (self.T,)


        buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
        self.expert_buffer = None

        self.all_variables = self._global_vars('')

        if to_goal is None:
            print("to goal is none!")
            self.to_goal = (0, 2)
        else:
            self.to_goal = to_goal
        self.to_goal_func = (lambda x: x[self.to_goal[0] : self.to_goal[1]]) if len(self.to_goal) == 2 else (lambda x: x[np.array(self.to_goal)])
        self.nearby_action_penalty = nearby_action_penalty
        self.nearby_penalty_weight = nearby_penalty_weight
Example #11
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 bc_loss,
                 q_filter,
                 num_demo,
                 demo_batch_size,
                 prm_loss_weight,
                 aux_loss_weight,
                 sample_transitions,
                 gamma,
                 seed,
                 start_timesteps,
                 eval_freq,
                 max_timesteps,
                 expl_noise,
                 hrl_batch_size,
                 discount,
                 tau,
                 policy_noise,
                 noise_clip,
                 policy_freq,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
            Added functionality to use demonstrations for training to Overcome exploration problem.
        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
            bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
            q_filter: whether or not a filter on the q value update should be used when training with demonstrations
            num_demo: number of episodes to be used in the demonstration buffer
            demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
            prm_loss_weight: weight corresponding to the primary loss
            aux_loss_weight: weight corresponding to the auxiliary loss, also called the cloning loss
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T + 1, self.dimg)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)

        global demoBuffer
        demoBuffer = ReplayBuffer(
            buffer_shapes, buffer_size, self.T, self.sample_transitions
        )  # initialize the demo buffer in the same way as the primary data buffer

        ################# hrl ###############
        ###############
        # BUILD MODEL #
        ###############
        #self.num_goal = num_goal
        #self.num_action = num_action
        #self.batch_size = batch_size

        #state_dim = 6
        #action_dim = 6
        #max_action = float(env.action_space.high[0])

        # Construct meta-controller and controller
        '''
        self.meta_controller = MetaController().type(dtype)
        self.target_meta_controller = MetaController().type(dtype)
        self.controller = Controller().type(dtype)
        self.target_controller = Controller().type(dtype)
        '''
        #self.meta_controller = TD3.TD3(state_dim, action_dim, max_action)
        #self.meta_controller = TD3(self.dimo, self.dimo, max_u)
        self.meta_controller = TD3(self.dimo + self.dimg, self.dimo,
                                   self.clip_obs)

        #self.controller = TD3.TD3(state_dim, action_dim, max_action)
        self.controller = TD3(2 * self.dimo, self.dimu, max_u)

        #self.meta_replay_memory = ReplayBuffer()
        #self.ctrl_replay_memory = ReplayBuffer()
        self.low_replay_buffer = H_ReplayBuffer()
        self.high_replay_buffer = H_ReplayBuffer()

        self.clip_obs2 = 5
Example #12
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 gg_k,
                 replay_strategy,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function) function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)
        self.replay_strategy = replay_strategy

        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K:
            self.max_g = kwargs['max_g']
            self.d0 = kwargs['d0']
            self.slope = kwargs['slope']
            self.goal_lr = kwargs['goal_lr']
            # reward shaping parameters
            self.rshape_lambda = kwargs['rshape_lambda']
            self.reshape_p = kwargs['rshape_p']
            self.rshaping = kwargs['rshaping']

            self.input_dims['e'] = self.dimg * self.T
            self.input_dims['mask'] = self.T
            self.dime = self.input_dims['e']
            self.dim_mask = self.input_dims['mask']

        input_shapes = dims_to_shapes(self.input_dims)

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T + 1, self.dimg)

        if self.replay_strategy in [
                C.REPLAY_STRATEGY_BEST_K, C.REPLAY_STRATEGY_GEN_K,
                C.REPLAY_STRATEGY_GEN_K_GMM
        ]:
            buffer_shapes['gg'] = (self.T, self.gg_k, self.dimg)

        if self.replay_strategy in [
                C.REPLAY_STRATEGY_BEST_K, C.REPLAY_STRATEGY_GEN_K_GMM
        ]:
            buffer_shapes['gg_idx'] = (self.T, self.gg_k)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
Example #13
    def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size,
                 Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
                 rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return,
                 sample_transitions, gamma, replay_k, reward_fun=None, reuse=False, **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        # Create the actor critic networks. network_class is defined in actor_critic.py
        # This class is assigned to network_class when the DDPG object is created
        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        # Next state (o_2) and goal at next state (g_2)
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None,)
        self.stage_shapes = stage_shapes

        # Adding variable for correcting bias - Ameet
        self.stage_shapes_new = OrderedDict()
        self.stage_shapes_new['bias'] = (None,)
        ##############################################

        # Create network
        # Staging area is a datatype in tf to input data into GPUs
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
            
            # Adding bias term from section 3.4 - Ameet
            self.staging_tf_new = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes_new.keys()],
                shapes=list(self.stage_shapes_new.values()))
            self.buffer_ph_tf_new = [
                tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes_new.values()]
            self.stage_op_new = self.staging_tf_new.put(self.buffer_ph_tf_new)
            ############################################

            self._create_network(reuse=reuse)

        # Configure the replay buffer
        buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key])
                         for key, val in input_shapes.items()}
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T+1, self.dimg)

        buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size

        # conf represents the parameters required for initializing the priority_queue
        # Remember: The bias gets annealed only conf.total_steps number of times
        conf = {'size': self.buffer_size,
                'learn_start': self.batch_size,
                'batch_size': self.batch_size,
                # Using some heuristic to set the partition_num as it matters only when the buffer is not full (unlikely)
                'partition_size': (self.replay_k)*100}

        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, conf, self.replay_k)

        # global_steps represents the number of batches used for updates
        self.global_step = 0
        self.debug = {}
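The constructor above only builds the staging graph; at training time the host first runs stage_op (and stage_op_new) with the placeholders fed from the replay buffer, and only then runs the ops that consume staging_tf.get(). Below is a minimal, self-contained sketch of that put/get round trip, assuming TF 1.x (where StagingArea can be imported from tensorflow.contrib.staging) and a made-up two-entry stage_shapes; it is an illustration, not the repository's training loop.

import numpy as np
import tensorflow as tf
from tensorflow.contrib.staging import StagingArea

# Illustrative shapes: a batch of observations and one scalar reward per transition.
stage_shapes = {'o': (None, 10), 'r': (None,)}

staging = StagingArea(dtypes=[tf.float32] * len(stage_shapes),
                      shapes=list(stage_shapes.values()))
buffer_ph = [tf.placeholder(tf.float32, shape=s) for s in stage_shapes.values()]
stage_op = staging.put(buffer_ph)

o_staged, r_staged = staging.get()               # tensors a training op would consume
dummy_loss = tf.reduce_mean(tf.square(o_staged)) + tf.reduce_mean(r_staged)

with tf.Session() as sess:
    feed = {buffer_ph[0]: np.random.randn(4, 10), buffer_ph[1]: np.random.randn(4)}
    sess.run(stage_op, feed_dict=feed)           # 1) stage a batch
    print(sess.run(dummy_loss))                  # 2) the "train step" pulls the staged batch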
示例#14
0
    def __init__(self,
                 *,
                 input_dims,
                 size_ensemble,
                 use_Q,
                 use_double_network,
                 buffer_size,
                 hidden,
                 layers,
                 batch_size,
                 lr,
                 norm_eps,
                 norm_clip,
                 polyak,
                 max_u,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 **kwargs):
        """Implementation of value function ensemble.

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            size_ensemble (int): number of value functions in the ensemble
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            batch_size (int): batch size for training
            lr (float): learning rate for the Q (critic) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped in Bellman update
            inference_clip_pos_returns (boolean): whether or not the value-function output used for disagreement should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.use_double_network:
            self.use_Q = True
            self.create_v_function = DoubleQFunction
        elif self.use_Q:
            self.create_v_function = QFunction
        else:
            self.create_v_function = VFunction

        if self.clip_return is None:
            self.clip_return = np.inf
        # self.inference_clip_range = (-self.clip_return, 0. if inference_clip_pos_returns else self.clip_return)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        if self.use_Q:
            stage_shapes['u_2'] = stage_shapes['u']
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = [None] * self.size_ensemble
            self.stage_ops = [None] * self.size_ensemble
            self.buffer_ph_tf = []
            for e in range(self.size_ensemble):
                staging_tf = StagingArea(
                    dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                    shapes=list(self.stage_shapes.values()))
                buffer_ph_tf = [
                    tf.placeholder(tf.float32, shape=shape)
                    for shape in self.stage_shapes.values()
                ]
                stage_op = staging_tf.put(buffer_ph_tf)

                # store in attribute list
                self.staging_tf[e] = staging_tf
                self.buffer_ph_tf.extend(buffer_ph_tf)
                self.stage_ops[e] = stage_op

            if self.use_double_network:
                self._create_double_network(reuse=reuse)
            else:
                self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['ag'] = (self.T, self.dimg)
        # if self.use_Q:
        #     buffer_shapes['u_2'] = (self.T-1, self.dimu)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
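Because each ensemble member gets its own StagingArea here, self.buffer_ph_tf ends up as a flat list of every member's placeholders, and all stage ops can be run in one session call. A rough sketch of that feeding pattern under the same TF 1.x assumption, with hypothetical values for size_ensemble and the shapes:

import numpy as np
import tensorflow as tf
from tensorflow.contrib.staging import StagingArea

size_ensemble = 3                                 # hypothetical ensemble size
stage_shapes = {'o': (None, 10), 'r': (None,)}    # hypothetical shapes

staging_areas, stage_ops, buffer_ph_tf = [], [], []
for _ in range(size_ensemble):
    sa = StagingArea(dtypes=[tf.float32] * len(stage_shapes),
                     shapes=list(stage_shapes.values()))
    phs = [tf.placeholder(tf.float32, shape=s) for s in stage_shapes.values()]
    staging_areas.append(sa)
    stage_ops.append(sa.put(phs))
    buffer_ph_tf.extend(phs)                      # flat list, in ensemble order

with tf.Session() as sess:
    # One (possibly different) batch per member, flattened to line up with buffer_ph_tf.
    batches = [[np.random.randn(4, 10), np.random.randn(4)] for _ in range(size_ensemble)]
    flat = [arr for batch in batches for arr in batch]
    sess.run(stage_ops, feed_dict=dict(zip(buffer_ph_tf, flat)))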
示例#15
0
File: ddpg.py Project: poisonwine/GHER
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'GHER.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """

        # # print("\n\n\n\n1--", input_dims, "\n2--", buffer_size, "\n3--", hidden,
        #         "\n4--", layers, "\n5--", network_class, "\n6--", polyak, "\n7--", batch_size,
        #          "\n8--", Q_lr, "\n9--", pi_lr, "\n10--", norm_eps, "\n11--", norm_clip,
        #          "\n12--", max_u, "\n13--", action_l2, "\n14--", clip_obs, "\n15--", scope, "\n16--", T,
        #          "\n17--", rollout_batch_size, "\n18--", subtract_goals, "\n19--", relative_goals,
        #          "\n20--", clip_pos_returns, "\n21--", clip_return,
        #          "\n22--", sample_transitions, "\n23--", gamma)
        """
        Example parameter values when running FetchReach-v1:
            input_dims (dict of ints):  {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1}  (o, u and g are all network inputs)
            buffer_size (int):  1E6     (total number of samples in the replay buffer)
            hidden (int): 256          (number of units per hidden layer)
            layers (int): 3            (three-layer network)
            network_class (str):        'GHER.ActorCritic'
            polyak (float): 0.95       (smoothing coefficient for target-network updates)
            batch_size (int): 256      (batch size)
            Q_lr (float): 0.001         (learning rate)
            pi_lr (float): 0.001        (learning rate)
            norm_eps (float): 0.01      (used to avoid numerical overflow)
            norm_clip (float): 5        (norm_clip)
            max_u (float): 1.0          (actions lie in [-1.0, 1.0])
            action_l2 (float): 1.0      (L2 regularization coefficient for the actor loss)
            clip_obs (float): 200       (observations are clipped to (-200, +200))
            scope (str): "ddpg"         (TensorFlow scope name)
            T (int): 50                 (number of environment interactions per episode)
            rollout_batch_size (int): 2 (number of parallel rollouts per DDPG agent)
            subtract_goals (function):  goal preprocessing function; given a and b it returns a - b
            relative_goals (boolean):   False  (True if goals should be preprocessed with subtract_goals)
            clip_pos_returns (boolean): True   (whether positive returns should be clipped away)
            clip_return (float): 50     (returns are limited to [-clip_return, clip_return])
            sample_transitions (function):  function returned by HER; its parameters are defined in config.py
            gamma (float): 0.98         (discount factor used for Q-network updates)

            sample_transitions comes from the HER definition and is the key component here.
        """

        if self.clip_return is None:
            self.clip_return = np.inf

        # The network structure and computation graph are created in actor_critic.py
        self.create_actor_critic = import_function(self.network_class)

        # Extract dimensions
        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']  # 10
        self.dimg = self.input_dims['g']  # 3
        self.dimu = self.input_dims['u']  # 4
        # print("+++", input_shapes)    #  {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)}

        # https://www.tensorflow.org/performance/performance_models
        # StagingArea offers simpler functionality and can run in parallel with other stages on CPU and GPU:
        #       it splits the input pipeline into 3 independent stages that operate in parallel, which scales
        #       and makes full use of large multi-core environments.

        # Define the required storage variables. Suppose self.dimo=10, self.dimg=5, self.dimu=5;
        # then stage_shapes={'o': (None, 10), 'g': (None, 5), 'u': (None, 5)},
        # plus the variables used by the target network: {'o_2': (None, 10), 'g_2': (None, 5)}
        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )  # the reward is a scalar
        self.stage_shapes = stage_shapes
        # After this, self.stage_shapes =
        #       OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)), ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))])
        # i.e. g, o, u, the o_2 and g_2 used by the target network, and the reward r

        # Create network.
        # Create tf placeholders according to stage_shapes, covering g, o, u, o_2, g_2 and r
        # self.buffer_ph_tf = [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>,
        #                     <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>]
        with tf.variable_scope(self.scope):
            # Create the StagingArea
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            # Create the TensorFlow placeholders
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            # Connect the TensorFlow placeholders to the StagingArea
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
            #
            self._create_network(reuse=reuse)

        # Replay-buffer bookkeeping
        # With T = 50, buffer_shapes ends up as
        #         {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'info_is_success': (50, 1), 'ag': (51, 3)}
        # Note that u and g record every step of an episode, hence 50 entries, while o and ag need
        # one extra entry because they also include the final state.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }  #
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)  #
        buffer_shapes['ag'] = (self.T + 1, self.dimg)  #
        # print("+++", buffer_shapes)

        # buffer_size is measured in number of stored samples
        # self.buffer_size=1E6  self.rollout_batch_size=2  buffer_size=1E6
        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
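The stage_shapes bookkeeping described in the comments above is plain Python and can be verified on its own. A short sketch using the FetchReach-like dimensions quoted in the docstring (with a simplified stand-in for dims_to_shapes; no TensorFlow required):

from collections import OrderedDict

input_dims = {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1}
# Simplified stand-in for dims_to_shapes: each dimension becomes a 1-D shape tuple.
input_shapes = {key: (val,) for key, val in input_dims.items()}

stage_shapes = OrderedDict()
for key in sorted(input_dims.keys()):
    if key.startswith('info_'):        # info_* keys are not fed to the network
        continue
    stage_shapes[key] = (None, *input_shapes[key])
for key in ['o', 'g']:                 # next-step observation and goal for the target network
    stage_shapes[key + '_2'] = stage_shapes[key]
stage_shapes['r'] = (None,)            # scalar reward per transition

print(stage_shapes)
# OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)),
#              ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))])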
示例#16
0
    def __init__(self,
                 buffer,
                 input_dims,
                 hidden,
                 layers,
                 polyak,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 gamma,
                 vloss_type='normal',
                 priority=False,
                 reuse=False,
                 **kwargs):
        """
        buffer (object): buffer to save transitions
        input_dims (dict of ints): dimensions for the observation (o), the goal (g), 
            and the actions (u)
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        polyak (float): coefficient for Polyak-averaging of the target network
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        gamma (float): gamma used for Q learning updates
        vloss_type (str): value loss type, 'normal', 'tf_gamma', 'target'
        priority(boolean): use priority or not
        reuse (boolean): whether or not the networks should be reused
        """
        if self.clip_return is None:
            self.clip_return = np.inf
        self.dimo, self.dimg, self.dimu = self.input_dims[
            'o'], self.input_dims['g'], self.input_dims['u']
        self.stage_shapes = self.get_stage_shapes()
        self.init_target_net_op = None
        self.update_target_net_op = None

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
            self._create_network(reuse=reuse)

        logger.log('value loss type: {}'.format(self.vloss_type))
示例#17
0
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 bc_loss,
                 q_filter,
                 num_demo,
                 demo_batch_size,
                 prm_loss_weight,
                 aux_loss_weight,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 use_seperate_networks=False,
                 **kwargs):

        if self.clip_return is None:
            self.clip_return = np.inf

        if use_seperate_networks:
            self.create_naf_network = import_function(
                "her.naf_utils.naf_network_seperate:Network")
        else:
            self.create_naf_network = import_function(
                "her.naf_utils.naf_network_shared:Network")

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']
        self.counter = 0

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T, self.dimg)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)

        global DEMO_BUFFER
        DEMO_BUFFER = ReplayBuffer(
            buffer_shapes, buffer_size, self.T, self.sample_transitions
        )  #initialize the demo buffer; in the same way as the primary data buffer
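The replay-buffer shapes in this variant follow the horizon convention directly: per-step quantities store T-1 entries, while the observation and achieved goal keep one extra entry for the final state. A quick stand-alone check of that computation with illustrative dimensions:

T = 50
input_shapes = {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)}
dimg = 3

# Same rule as above: 'o' keeps T entries, everything else T - 1.
buffer_shapes = {key: (T - 1 if key != 'o' else T, *shape)
                 for key, shape in input_shapes.items()}
buffer_shapes['g'] = (buffer_shapes['g'][0], dimg)
buffer_shapes['ag'] = (T, dimg)        # achieved goals also cover the final state

print(buffer_shapes)
# {'o': (50, 10), 'u': (49, 4), 'g': (49, 3), 'info_is_success': (49, 1), 'ag': (50, 3)}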
示例#18
0
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class_actor_critic,
                 network_class_discriminator,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 mi_lr,
                 sk_lr,
                 r_scale,
                 mi_r_scale,
                 sk_r_scale,
                 et_r_scale,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 env_name,
                 max_timesteps,
                 pretrain_weights,
                 finetune_pi,
                 mi_prioritization,
                 sac,
                 reuse=False,
                 history_len=10000,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(
            self.network_class_actor_critic)
        self.create_discriminator = import_function(
            self.network_class_discriminator)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimz = self.input_dims['z']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        self.env_name = env_name

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        stage_shapes['w'] = (None, )
        stage_shapes['m'] = (None, )
        stage_shapes['s'] = (None, )
        stage_shapes['m_w'] = ()
        stage_shapes['s_w'] = ()
        stage_shapes['r_w'] = ()
        stage_shapes['e_w'] = ()
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(pretrain_weights,
                                 mi_prioritization,
                                 reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T + 1, self.dimg)
        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size

        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions, mi_prioritization)

        self.mi_r_history = deque(maxlen=history_len)
        self.gl_r_history = deque(maxlen=history_len)
        self.sk_r_history = deque(maxlen=history_len)
        self.et_r_history = deque(maxlen=history_len)
        self.mi_current = 0
        self.finetune_pi = finetune_pi
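The 'm_w', 's_w', 'r_w' and 'e_w' entries above use an empty shape tuple, so the placeholders built from stage_shapes for them are scalars rather than per-transition vectors. A tiny TF 1.x sketch of how shape=() differs from shape=(None,):

import tensorflow as tf

per_transition = tf.placeholder(tf.float32, shape=(None,))  # e.g. 'r': one value per transition
whole_batch = tf.placeholder(tf.float32, shape=())          # e.g. 'r_w': one weight for the whole batch

weighted = per_transition * whole_batch                     # the scalar broadcasts over the batch

with tf.Session() as sess:
    print(sess.run(weighted, feed_dict={per_transition: [1.0, 2.0, 3.0], whole_batch: 0.5}))
    # [0.5 1.  1.5]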
示例#19
0
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 time_horizon,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 reuse=False):
        """
        Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g), and the actions (u)
        :param buffer_size: (int) number of transitions that are stored in the replay buffer
        :param hidden: (int) number of units in the hidden layers
        :param layers: (int) number of hidden layers
        :param network_class: (str) the network class that should be used (e.g. 'baselines.her.ActorCritic')
        :param polyak: (float) coefficient for Polyak-averaging of the target network
        :param batch_size: (int) batch size for training
        :param q_lr: (float) learning rate for the Q (critic) network
        :param pi_lr: (float) learning rate for the pi (actor) network
        :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities
        :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip]
        :param max_u: (float) maximum action magnitude, i.e. actions are in [-max_u, max_u]
        :param action_l2: (float) coefficient for L2 penalty on the actions
        :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs]
        :param scope: (str) the scope used for the TensorFlow graph
        :param time_horizon: (int) the time horizon for rollouts
        :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent
        :param subtract_goals: (function (numpy Number, numpy Number): numpy Number) function that subtracts goals
            from each other
        :param relative_goals: (boolean) whether or not relative goals should be fed into the network
        :param clip_pos_returns: (boolean) whether or not positive returns should be clipped
        :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
        :param sample_transitions: (function (dict, int): dict) function that samples from the replay buffer
        :param gamma: (float) gamma used for Q learning updates
        :param reuse: (boolean) whether or not the networks should be reused
        """
        # Updated in experiments/config.py
        self.input_dims = input_dims
        self.buffer_size = buffer_size
        self.hidden = hidden
        self.layers = layers
        self.network_class = network_class
        self.polyak = polyak
        self.batch_size = batch_size
        self.q_lr = q_lr
        self.pi_lr = pi_lr
        self.norm_eps = norm_eps
        self.norm_clip = norm_clip
        self.max_u = max_u
        self.action_l2 = action_l2
        self.clip_obs = clip_obs
        self.scope = scope
        self.time_horizon = time_horizon
        self.rollout_batch_size = rollout_batch_size
        self.subtract_goals = subtract_goals
        self.relative_goals = relative_goals
        self.clip_pos_returns = clip_pos_returns
        self.clip_return = clip_return
        self.sample_transitions = sample_transitions
        self.gamma = gamma
        self.reuse = reuse

        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dim_obs = self.input_dims['o']
        self.dim_goal = self.input_dims['g']
        self.dim_action = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.time_horizon if key != 'o' else self.time_horizon + 1,
                  *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal)
        buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size,
                                   self.time_horizon, self.sample_transitions)
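buffer_size is rounded down to a multiple of rollout_batch_size, presumably because episodes are stored in groups of rollout_batch_size parallel rollouts. The arithmetic, as a quick check with a hypothetical rollout_batch_size:

requested_buffer_size = int(1e6)   # value used in the configs quoted above
rollout_batch_size = 3             # hypothetical number of parallel rollouts

buffer_size = (requested_buffer_size // rollout_batch_size) * rollout_batch_size
print(buffer_size)                 # 999999, the largest multiple of 3 that fits in 1e6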
示例#20
0
    def __init__(self,
                 input_dims,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 normalize_obs,
                 sample_transitions,
                 gamma,
                 buffers=None,
                 reuse=False,
                 tasks_ag_id=None,
                 tasks_g_id=None,
                 task_replay='',
                 t_id=None,
                 eps_task=None,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
            buffers (list): buffers to be used to store new transitions (usually one per task + 1)
            tasks_ag_id (list): indices to find achieved goals for each task in the achieved goal vector
            tasks_g_id (list): indices to find goals for each task in the goal vector
            task_replay (str): defines the task replay strategy (see train.py for info)
            t_id (int): index of the task corresponding to this policy when using a task-experts structure
            eps_task (float): epsilon parameter for the epsilon greedy strategy (task choice)
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)
        self.normalize_obs = normalize_obs

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimag = self.input_dims['ag']
        self.dimu = self.input_dims['u']
        if self.structure == 'curious' or self.structure == 'task_experts':
            self.dimtd = self.input_dims['task_descr']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, 1)
        self.stage_shapes = stage_shapes

        if t_id is not None:
            self.scope += str(t_id)
        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # addition for multi-task structures
        if self.structure == 'curious' or self.structure == 'task_experts':
            self.tasks_g_id = tasks_g_id
            self.tasks_ag_id = tasks_ag_id
            self.nb_tasks = len(tasks_g_id)

        if buffers is not None:
            self.buffer = buffers
            if type(self.buffer) is list:
                if len(self.buffer) > 5:
                    # distractor buffers are equal
                    for i in range(6, len(self.buffer)):
                        self.buffer[i] = self.buffer[5]
        self.first = True
示例#21
0
    def __init__(self, FLAGS, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size,
                 Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
                 rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return,
                 bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight,
                 
                #  sample_transitions, gamma, reuse=False, **kwargs):
                 sample_transitions, gamma, td3_policy_freq, td3_policy_noise, td3_noise_clip, reuse=False, *agent_params, **kwargs): ##
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
            Added functionality to use demonstrations for training to overcome the exploration problem.

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
            bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
            q_filter: whether or not a filter on the q value update should be used when training with demonstrations
            num_demo: Number of episodes to be used in the demonstration buffer
            demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
            prm_loss_weight: Weight corresponding to the primary loss
            aux_loss_weight: Weight corresponding to the auxiliary loss, also called the cloning loss

            agent_params: for HAC agent params
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        # self.dimo1= self.input_dims['o1'] ##A.R add for TD3 (has obs0, obs1)
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']
        # Added content:
        # parameters for using the TD3 variant of DDPG
        #https://arxiv.org/abs/1802.09477
        self.td3_policy_freq = td3_policy_freq
        self.td3_policy_noise = td3_policy_noise
        self.td3_noise_clip = td3_noise_clip

        ## for HAC
        self.FLAGS = FLAGS

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
        # for key in ['o', 'o1', 'g']: #o1 added by A.R
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None,)
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, *input_shapes[key]) 
        # origin : buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, *input_shapes[key]) 
        # buffer_shapes = {key: (self.T-1 if key != 'o' and key != 'o1' else self.T, *input_shapes[key]) # A.R
                         for key, val in input_shapes.items()}
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T, self.dimg)

        buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)

        global DEMO_BUFFER
        DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) #initialize the demo buffer; in the same way as the primary data buffer
        print("@ ddgp.py , buffer={}".format(self.buffer))
示例#22
0
File: ddpg.py Project: s-bl/cwyc
    def __init__(self,
                 env_spec,
                 task_spec,
                 buffer_size,
                 network_params,
                 normalizer_params,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 random_eps,
                 noise_eps,
                 train_steps,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 replay_strategy,
                 replay_k,
                 noise_type,
                 share_experience,
                 noise_adaptation,
                 reuse=False):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
            Added functionality to use demonstrations for training to overcome the exploration problem.

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
            bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
            q_filter: whether or not a filter on the q value update should be used when training with demonstrations
            num_demo: Number of episodes to be used in the demonstration buffer
            demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
            prm_loss_weight: Weight corresponding to the primary loss
            aux_loss_weight: Weight corresponding to the auxiliary loss, also called the cloning loss
        """
        super().__init__(scope)
        self.replay_k = replay_k
        self.replay_strategy = replay_strategy
        self.clip_pos_returns = clip_pos_returns
        self.relative_goals = relative_goals
        self.train_steps = train_steps
        self.noise_eps = noise_eps
        self.random_eps = random_eps
        self.clip_obs = clip_obs
        self.action_l2 = action_l2
        self.max_u = max_u
        self.pi_lr = pi_lr
        self.Q_lr = Q_lr
        self.batch_size = batch_size
        self.normalizer_params = normalizer_params
        self.polyak = polyak
        self.buffer_size = buffer_size
        self._env_spec = env_spec
        self._T = self._env_spec['T']
        self._task_spec = task_spec
        self.network_params = network_params
        self._share_experience = share_experience
        self._noise_adaptation = noise_adaptation

        self._task_spec = deepcopy(task_spec)
        self._task_spec['buffer_size'] = 0
        self._task = Task(**self._task_spec)

        self._gamma = 1. - 1. / self._T
        self.clip_return = (1. / (1. - self._gamma)) if clip_return else np.inf

        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(network_params['net_type'])

        self.input_dims = dict(
            o=self._env_spec['o_dim'],
            a=self._env_spec['a_dim'],
            g=self._task_spec['g_dim'],
        )

        input_shapes = dims_to_shapes(self.input_dims)

        self.dimo = self._env_spec['o_dim']
        self.dimg = self._task_spec['g_dim']
        self.dima = self._env_spec['a_dim']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_next'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        self._action_noise, self._parameter_noise = get_noise_from_string(
            self._env_spec, noise_type)

        # Create network.
        with tf.variable_scope(self._scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        buffer_shapes = dict()
        buffer_shapes['o'] = (self.dimo, )
        buffer_shapes['o_next'] = buffer_shapes['o']
        buffer_shapes['g'] = (self.dimg, )
        buffer_shapes['ag'] = (self.dimg, )
        buffer_shapes['ag_next'] = (self.dimg, )
        buffer_shapes['a'] = (self.dima, )

        self.sample_transitions = make_sample_her_transitions(
            self.replay_strategy, self.replay_k,
            self._task.reward_done_success)

        self._buffer = ReplayBuffer(buffer_shapes, self.buffer_size, self._T,
                                    self.sample_transitions)
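Here the discount is tied to the task horizon, gamma = 1 - 1/T, so the return clip 1/(1 - gamma) works out to T. A one-line check:

T = 50
gamma = 1.0 - 1.0 / T               # 0.98
clip_return = 1.0 / (1.0 - gamma)   # ~= 50, i.e. equal to T (up to floating-point rounding)
print(gamma, clip_return)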
示例#23
0
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 bc_loss,
                 q_filter,
                 num_demo,
                 demo_batch_size,
                 prm_loss_weight,
                 aux_loss_weight,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 pre_train_model=False,
                 update_model=True,
                 feature_net_path='',
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
            Added functionality to use demonstrations for training to overcome the exploration problem.
        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
            bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
            q_filter: whether or not a filter on the q value update should be used when training with demonstrations
            num_demo: Number of episodes to be used in the demonstration buffer
            demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
            prm_loss_weight: Weight corresponding to the primary loss
            aux_loss_weight: Weight corresponding to the auxiliary loss also called the cloning loss
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        # ADDED
        self.use_contact = (self.contact_dim > 0)
        self.pre_train_model = pre_train_model
        self.feature_net_path = feature_net_path
        self.process_type = kwargs['process_type']
        self.contact_dim = kwargs['contact_dim']
        self.__dict__['use_contact'] = self.use_contact
        self.__dict__['pre_train'] = self.pre_train_model

        self.create_actor_critic = import_function(self.network_class)
        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o'] - self.contact_dim
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']
        self.feature_dim = kwargs['feature_dim']
        self.contact_point_dim = self.contact_dim // self.fixed_num_of_contact

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            logger.info("Creating a DDPG agent with action space %d x %s..." %
                        (self.dimu, self.max_u))
            self.sess = tf_util.get_session()
            # order: ['g', 'o', 'u', 'o_2', 'g_2', 'r'])
            # Staging area shared by all variants: the placeholders in
            # `buffer_ph_tf` are fed into the graph through `stage_op`.
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            if self.pre_train_model == 'cpc':
                # Additional staging area for the CPC positive/negative
                # contact samples.
                self.cpc_shape = OrderedDict()
                self.cpc_shape['obs_neg'] = (None, self.fixed_num_of_contact,
                                             self.contact_point_dim)
                self.cpc_shape['obs_pos'] = (None, self.fixed_num_of_contact,
                                             self.contact_point_dim)
                self.cpc_staging_tf = StagingArea(
                    dtypes=[tf.float32 for _ in self.cpc_shape.keys()],
                    shapes=list(self.cpc_shape.values()))
                self.cpc_buffer_ph_tf = [
                    tf.placeholder(tf.float32, shape=shape)
                    for shape in self.cpc_shape.values()
                ]
                self.cpc_stage_op = self.cpc_staging_tf.put(
                    self.cpc_buffer_ph_tf)
            self.update_model = update_model

            if self.pre_train_model != 'none':
                self.__dict__['feature_net_path'] = self.feature_net_path
                self.__dict__['clip_obs'] = self.clip_obs

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
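        # 'o' keeps one extra timestep so both o_t and o_{t+1} of every transition are stored.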
        buffer_shapes = {
            key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
            for key in input_shapes.keys()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T, self.dimg)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)
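
The staging placeholders in `buffer_ph_tf` line up one-to-one with `stage_shapes`, so feeding the model is just a matter of zipping a sampled batch onto the placeholders and running `stage_op`. A minimal sketch of such a helper, assuming a `sample_batch()` method (not shown in this excerpt) that returns arrays in `stage_shapes` order:

    def stage_batch(self, batch=None):
        # `batch` is a list of arrays ordered like `self.stage_shapes`;
        # `self.sample_batch()` is assumed to provide such a list.
        if batch is None:
            batch = self.sample_batch()
        assert len(self.buffer_ph_tf) == len(batch)
        self.sess.run(self.stage_op,
                      feed_dict=dict(zip(self.buffer_ph_tf, batch)))
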
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        self.ep_ctr = 0
        self.hist_bins = 50
        self.draw_hist_freq = 3
        self._reset_hists()
        self.shared_pi_err_coeff = kwargs['shared_pi_err_coeff']

        HRL_Policy.__init__(self, input_dims, T, rollout_batch_size, **kwargs)

        self.hidden = hidden
        self.layers = layers
        self.max_u = max_u
        self.network_class = network_class
        self.sample_transitions = sample_transitions
        self.scope = scope
        self.subtract_goals = subtract_goals
        self.relative_goals = relative_goals
        self.clip_obs = clip_obs
        self.Q_lr = Q_lr
        self.pi_lr = pi_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.clip_pos_returns = clip_pos_returns
        self.gamma = gamma
        self.polyak = polyak
        self.clip_return = clip_return
        self.norm_eps = norm_eps
        self.norm_clip = norm_clip
        self.action_l2 = action_l2
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)
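        # `self.stage_shapes` is presumably prepared by HRL_Policy.__init__;
        # a per-transition discount 'gamma' is staged alongside the usual keys.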
        self.stage_shapes['gamma'] = (None, )
        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1,
                  *self.input_shapes[key])
            for key in self.input_shapes.keys()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T + 1, self.dimg)
        buffer_shapes['p'] = (buffer_shapes['g'][0], 1)
        buffer_shapes['steps'] = buffer_shapes['p']
        buffer_size = self.buffer_size  # not rounded down to a multiple of rollout_batch_size here
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)

        self.preproc_lr = (self.Q_lr + self.pi_lr) / 2
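        # Presumably the learning rate for the shared preprocessing layers:
        # the mean of the critic (Q_lr) and actor (pi_lr) learning rates.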