Example #1
 def process_pc(self, batch):
     """
     Returns feed dictionary for `pixel control` loss estimation subgraph.
     """
     if not self.use_off_policy_aac:  # use single pass of network on same off-policy batch
         feeder = {
             pl: value
             for pl, value in zip(
                 self.local_network.pc_lstm_state_pl_flatten,
                 flatten_nested(batch['context']))
         }
         feeder.update({
             self.local_network.pc_state_in:
             batch['state'],
             self.local_network.pc_a_r_in:
             batch['last_action_reward'],
             self.pc_action:
             batch['action'],
             self.pc_target:
             batch['pixel_change']
         })
     else:
         feeder = {
             self.pc_action: batch['action'],
             self.pc_target: batch['pixel_change']
         }
     return feeder
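The zip comprehension above is exactly the pattern wrapped by the feed_dict_rnn_context helper shown in Example #7 below; an equivalent, hedged one-liner, assuming that helper is importable:

# Equivalent to the comprehension above (assumes feed_dict_rnn_context is in scope):
feeder = feed_dict_rnn_context(
    self.local_network.pc_lstm_state_pl_flatten,  # flat LSTM context placeholders
    batch['context'])                             # nested LSTM context values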
Example #2
    def __init__(self, x_in, ob_space, ac_space, lstm_class, lstm_layers):

        # Flatten and expand with a fake time dim to feed the LSTM bank:
        x = tf.expand_dims(batch_flatten(x_in), [0])
        # x = tf.expand_dims(self.flatten_homebrew(x_in), [0])
        # Create the `train_phase` flag placeholder only if it is not already set:
        try:
            if self.train_phase is not None:
                pass

        except AttributeError:
            self.train_phase = tf.placeholder_with_default(
                tf.constant(False, dtype=tf.bool),
                shape=(),
                name='train_phase_flag_pl'
            )
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        #print('GOT HERE 2, x:', x.shape)
        #print('GOT HERE 2, train_phase:', self.train_phase.shape)
        #print('GOT HERE 2, update_ops:', self.update_ops)

        # Define LSTM layers:
        lstm = []
        for size in lstm_layers:
            lstm += [lstm_class(size, state_is_tuple=True)]

        self.lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True)
        # self.lstm = lstm[0]

        # Get time_dimension as [1]-shaped tensor:
        step_size = tf.expand_dims(tf.shape(x)[1], [0])
        #step_size = tf.shape(self.x)[:1]
        #print('GOT HERE 3')
        self.lstm_init_state = self.lstm.zero_state(1, dtype=tf.float32)

        lstm_state_pl = self.rnn_placeholders(self.lstm.zero_state(1, dtype=tf.float32))
        self.lstm_state_pl_flatten = flatten_nested(lstm_state_pl)

        #print('GOT HERE 4, x:', x.shape)
        lstm_outputs, self.lstm_state_out = tf.nn.dynamic_rnn(
            self.lstm,
            x,
            initial_state=lstm_state_pl,
            sequence_length=step_size,
            time_major=False
        )
        #print('GOT HERE 5')
        x = tf.reshape(lstm_outputs, [-1, lstm_layers[-1]])

        self.logits = self.linear(x, ac_space, "action", self.normalized_columns_initializer(0.01))
        self.vf = tf.reshape(self.linear(x, 1, "value", self.normalized_columns_initializer(1.0)), [-1])
        self.sample = self.categorical_sample(self.logits, ac_space)[0, :]
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        # Add moving averages to the save list (meant for batch-norm layers):
        moving_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*moving.*')
        renorm_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*renorm.*')

        self.var_list += moving_var_list + renorm_var_list
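The constructor above relies on two helpers not shown on this page: self.rnn_placeholders, which mirrors the nested LSTM zero_state with feed placeholders, and flatten_nested. A minimal sketch of what such an rnn_placeholders helper might look like in TF1, offered as an assumption rather than the project's actual code:

import tensorflow as tf
from tensorflow.contrib.framework import nest

def rnn_placeholders_sketch(state):
    # Flatten the (possibly multi-layer) state, swap each tensor for a
    # placeholder of the same dtype/shape, then restore the original nesting.
    flat_state = nest.flatten(state)
    flat_pl = [tf.placeholder(t.dtype, t.get_shape()) for t in flat_state]
    return nest.pack_sequence_as(structure=state, flat_sequence=flat_pl)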
Example #3
 def value(self, ob, lstm_state):
     sess = tf.get_default_session()
     feeder = {
         pl: value
         for pl, value in zip(self.lstm_state_pl_flatten,
                              flatten_nested(lstm_state))
     }
     feeder.update({self.x: [ob], self.train_phase: False})
     return sess.run(self.vf, feeder)[0]
Example #4
 def act(self, ob, lstm_state):
     sess = tf.get_default_session()
     feeder = {
         pl: value
         for pl, value in zip(self.lstm_state_pl_flatten,
                              flatten_nested(lstm_state))
     }
     feeder.update({self.x: [ob], self.train_phase: False})
     #print('#####_feeder:\n', feeder)
     return sess.run([self.sample, self.vf, self.lstm_state_out], feeder)
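For context, a hedged sketch of how act is typically driven from a runner loop, feeding the returned lstm_state_out back in on the next step; policy, env and rollout_length are illustrative names, not taken from this listing:

# Illustrative rollout loop (all names here are assumptions):
sess = tf.get_default_session()
last_state = sess.run(policy.lstm_init_state)  # numpy zero state to start from
ob = env.reset()
for _ in range(rollout_length):
    action, value, last_state = policy.act(ob, last_state)
    ob, reward, done, info = env.step(action.argmax())  # `sample` is one-hot
    if done:
        last_state = sess.run(policy.lstm_init_state)
        ob = env.reset()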
Example #5
 def get_a3c_value(self, observation, lstm_state, action_reward):
     """Called by thread-runner."""
     sess = tf.get_default_session()
     feeder = {pl: value for pl, value in zip(self.a3c_lstm_state_pl_flatten, flatten_nested(lstm_state))}
     feeder.update(
         {self.a3c_state_in: [observation],
          self.a3c_a_r_in: [action_reward],
          self.train_phase: False}
     )
     return sess.run(self.a3c_vf, feeder)[0]
Example #6
    def process(self, sess):
        """
        Algorithm single training step.
        Grabs an on_policy_rollout that's been produced by the thread runner,
        samples off_policy rollout[s] from replay memory and updates the parameters.
        The update is then sent to the parameter server.
        """

        # Copy weights from shared to local new_policy:
        sess.run(self.sync)

        # Get and process rollout:
        on_policy_rollout = self.pull_batch_from_queue()
        on_policy_batch = on_policy_rollout.process(
            gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for on-policy AAC loss estimation graph:
        feed_dict = {
            pl: value
            for pl, value in zip(self.local_network.on_lstm_state_pl_flatten,
                                 flatten_nested(on_policy_batch['context']))
        }

        feed_dict.update({
            self.local_network.on_state_in:
            on_policy_batch['state'],
            self.local_network.on_a_r_in:
            on_policy_batch['last_action_reward'],
            self.on_pi_act_target:
            on_policy_batch['action'],
            self.on_pi_adv_target:
            on_policy_batch['advantage'],
            self.on_pi_r_target:
            on_policy_batch['r'],
            self.local_network.train_phase:
            True,
        })

        # Every worker writes model summaries:
        should_compute_summary =\
            self.local_steps % self.model_summary_freq == 0

        if should_compute_summary:
            fetches = [self.train_op, self.model_summary_op, self.inc_step]
        else:
            fetches = [self.train_op, self.inc_step]

        fetched = sess.run(fetches, feed_dict=feed_dict)

        if should_compute_summary:
            self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]),
                                            fetched[-1])
            self.summary_writer.flush()

        self.local_steps += 1
Example #7
def feed_dict_rnn_context(placeholders, values):
    """
    Creates tf.feed_dict for flat placeholders and nested values.

    Args:
        placeholders:       flat structure of placeholders
        values:             nested structure of values

    Returns:
        flat feed dictionary
    """
    return {key: value for key, value in zip(placeholders, flatten_nested(values))}
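An illustrative call, mirroring the hand-written comprehensions in the worker examples above (attribute names follow Example #6):

# Build the LSTM-context part of the feed dict in one call:
feed_dict = feed_dict_rnn_context(
    self.local_network.on_lstm_state_pl_flatten,  # flat placeholder structure
    on_policy_batch['context'])                   # nested LSTM state values
feed_dict.update({self.local_network.on_state_in: on_policy_batch['state']})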
Example #8
def flat_placeholders(ob_space, batch_dim=None, name='flt'):
    """
    Given nested observation space as dictionary of shape tuples,
    returns flattened dictionary of batch-wise placeholders.

    Args:
        ob_space:   [nested dict] of tuples
        name:       name_scope
        batch_dim:  batch dimension
    Returns:
        flat dictionary of tf.placeholders
    """
    return flatten_nested(nested_placeholders(ob_space, batch_dim=batch_dim, name=name))
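An illustrative call with a toy observation space; the keys and shapes below are made up for the example and are not taken from the source:

ob_space = {
    'external': (30, 4, 1),  # hypothetical price-data window
    'internal': (10,),       # hypothetical broker statistics
}
obs_pl = flat_placeholders(ob_space, batch_dim=None, name='obs')
# Each returned placeholder is expected to be batch-wise, i.e. shaped
# [batch_dim] + original shape, e.g. [None, 30, 4, 1] for 'external'.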
Example #9
    def process(self, sess):
        """
        Grabs a rollout that's been produced by the thread runner,
        and updates the parameters.  The update is then sent to the parameter
        server.
        """
        sess.run(self.sync)  # copy weights from shared to local
        rollout = self.pull_batch_from_queue()
        batch = process_rollout(rollout,
                                gamma=self.model_gamma,
                                lambda_=self.model_lambda)

        # Every worker writes model summaries (the chief-only check is disabled):
        should_compute_summary =\
            self.local_steps % self.model_summary_freq == 0   # self.task == 0 and

        if should_compute_summary:
            fetches = [self.model_summary_op, self.train_op, self.global_step]
        else:
            fetches = [self.train_op, self.global_step]

        feed_dict = {
            pl: value
            for pl, value in zip(self.local_network.lstm_state_pl_flatten,
                                 flatten_nested(batch.features))
        }
        feed_dict.update({
            self.local_network.x: batch.si,
            self.ac: batch.a,
            self.adv: batch.adv,
            self.r: batch.r,
            self.local_network.train_phase: True,
        })

        #print('TRAIN_FEED_DICT:\n', feed_dict)
        #print('\n=======S=======\n')
        #for key,value in feed_dict.items():
        #    try:
        #        print(key,':', value.shape,'\n')
        #    except:
        #        print(key, ':', value, '\n')
        #print('\n=====E======\n')

        fetched = sess.run(fetches, feed_dict=feed_dict)

        if should_compute_summary:
            self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]),
                                            fetched[-1])
            self.summary_writer.flush()

        self.local_steps += 1
Example #10
def lstm_network(x,
                 lstm_sequence_length,
                 lstm_class=rnn.BasicLSTMCell,
                 lstm_layers=(256, ),
                 name='lstm',
                 reuse=False,
                 **kwargs):
    """
    Stage2 network: from features to flattened LSTM output.
    Defines [multi-layered] dynamic [possibly shared] LSTM network.

    Returns:
         batch-wise flattened output tensor;
         lstm initial state tensor;
         lstm state output tensor;
         lstm flattened feed placeholders as tuple.
    """
    with tf.variable_scope(name, reuse=reuse):
        # Flatten, add action/reward and expand with fake [time] dim to feed LSTM bank (disabled below):
        #x = tf.concat([x, a_r] ,axis=-1)
        #x = tf.concat([batch_flatten(x), a_r], axis=-1)
        #x = tf.expand_dims(x, [0])

        # Define LSTM layers:
        lstm = []
        for size in lstm_layers:
            lstm += [lstm_class(size, state_is_tuple=True)]

        lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True)
        # Get time_dimension as [1]-shaped tensor:
        step_size = tf.expand_dims(tf.shape(x)[1], [0])

        lstm_init_state = lstm.zero_state(1, dtype=tf.float32)

        lstm_state_pl = rnn_placeholders(lstm.zero_state(1, dtype=tf.float32))
        lstm_state_pl_flatten = flatten_nested(lstm_state_pl)

        lstm_outputs, lstm_state_out = tf.nn.dynamic_rnn(
            lstm,
            x,
            initial_state=lstm_state_pl,
            sequence_length=lstm_sequence_length,
            time_major=False)
        #x_out = tf.reshape(lstm_outputs, [-1, lstm_layers[-1]])
        x_out = lstm_outputs
    return x_out, lstm_init_state, lstm_state_out, lstm_state_pl_flatten
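A hedged sketch of wiring the four returned values together; the features tensor and the downstream feed are assumptions, not code from this page:

# Assumed upstream: `features` shaped [1, time, n_features] (batch of one rollout).
seq_len = tf.expand_dims(tf.shape(features)[1], [0])
lstm_out, lstm_init, lstm_state_out, lstm_pl_flat = lstm_network(
    features, seq_len, lstm_layers=(256,), name='lstm')

# At run time the flat placeholders are fed from a nested context, e.g. via the
# feed_dict_rnn_context helper from Example #7:
# feed_dict = feed_dict_rnn_context(lstm_pl_flat, lstm_context_values)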
Example #11
 def process_pc(self, batch):
     """
     Returns feed dictionary for `pixel control` loss estimation subgraph.
     """
     if not self.use_off_policy_a3c:  # use single pass of network on same off-policy batch
         feeder = {
             pl: value
             for pl, value in zip(
                 self.local_network.pc_lstm_state_pl_flatten,
                 flatten_nested(batch.features))
         }
         feeder.update({
             self.local_network.pc_state_in: batch.si,
             self.local_network.pc_a_r_in: batch.last_ar,
             self.pc_action: batch.a,
             self.pc_target: batch.pc
         })
     else:
         feeder = {self.pc_action: batch.a, self.pc_target: batch.pc}
     return feeder
Example #12
 def process_vr(self, batch):
     """
     Returns feed dictionary for `value replay` loss estimation subgraph.
     """
     if not self.use_off_policy_aac:  # use single pass of network on same off-policy batch
         feeder = {
             pl: value
             for pl, value in zip(
                 self.local_network.vr_lstm_state_pl_flatten,
                 flatten_nested(batch['context']))
         }  # ...passes lstm context
         feeder.update({
             self.local_network.vr_state_in:
             batch['state'],
             self.local_network.vr_a_r_in:
             batch['last_action_reward'],
             self.vr_target:
             batch['r'],
         })
     else:
         feeder = {self.vr_target: batch['r']}  # redundant actually :)
     return feeder
Example #13
 def process_vr(self, batch):
     """
     Returns feed dictionary for `value replay` loss estimation subgraph.
     """
     if not self.use_off_policy_a3c:  # use single pass of network on same off-policy batch
         feeder = {
             pl: value
             for pl, value in zip(
                 self.local_network.vr_lstm_state_pl_flatten,
                 flatten_nested(batch.features))
         }  # ...passes lstm context
         feeder.update({
             self.local_network.vr_state_in:
             batch.si,
             self.local_network.vr_a_r_in:
             batch.last_ar,
             #self.vr_action: batch.a,  # don't need those for value fn. estimation
             #self.vr_advantage: batch.adv, # neither..
             self.vr_target:
             batch.r,
         })
     else:
         feeder = {self.vr_target: batch.r}  # redundant actually :)
     return feeder
Example #14
def lstm_network(
        x,
        lstm_sequence_length,
        lstm_class=rnn.BasicLSTMCell,
        lstm_layers=(256,),
        static=False,
        name='lstm',
        reuse=False,
        **kwargs
    ):
    """
    Stage2 network: from features to flattened LSTM output.
    Defines [multi-layered] dynamic [possibly shared] LSTM network.

    Returns:
         batch-wise flattened output tensor;
         lstm initial state tensor;
         lstm state output tensor;
         lstm flattened feed placeholders as tuple.
    """
    with tf.variable_scope(name, reuse=reuse):
        # Prepare rnn type:
        if static:
            rnn_net = tf.nn.static_rnn
            # Remove time dimension (assumes exactly one time step) and wrap in a list:
            x = [x[:, 0, :]]

        else:
            rnn_net = tf.nn.dynamic_rnn
        # Define LSTM layers:
        lstm = []
        for size in lstm_layers:
            lstm += [lstm_class(size)] #, state_is_tuple=True)]

        lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True)
        # Get time_dimension as [1]-shaped tensor:
        step_size = tf.expand_dims(tf.shape(x)[1], [0])

        lstm_init_state = lstm.zero_state(1, dtype=tf.float32)

        lstm_state_pl = rnn_placeholders(lstm.zero_state(1, dtype=tf.float32))
        lstm_state_pl_flatten = flatten_nested(lstm_state_pl)

        # print('rnn_net: ', rnn_net)

        lstm_outputs, lstm_state_out = rnn_net(
            cell=lstm,
            inputs=x,
            initial_state=lstm_state_pl,
            sequence_length=lstm_sequence_length,
        )

        # print('\nlstm_outputs: ', lstm_outputs)
        # print('\nlstm_state_out:', lstm_state_out)

        # Unwrap and expand:
        if static:
            x_out = lstm_outputs[0][:, None, :]
        else:
            x_out = lstm_outputs
        state_out = lstm_state_out
    return x_out, lstm_init_state, state_out, lstm_state_pl_flatten
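The static branch expects exactly one time step per call; a brief, hedged illustration of the two modes (tensor names and shapes are assumptions):

# Dynamic mode: x_dyn shaped [1, time, n_features], variable time length:
seq_len = tf.expand_dims(tf.shape(x_dyn)[1], [0])
out_dyn, _, _, pl_flat_dyn = lstm_network(x_dyn, seq_len, static=False, name='lstm_dyn')

# Static mode: x_one shaped [1, 1, n_features], single time step per call:
out_one, _, _, pl_flat_one = lstm_network(
    x_one, tf.ones([1], dtype=tf.int32), static=True, name='lstm_stat')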
Example #15
    def process(self, sess):
        """
        Grabs an on_policy_rollout that's been produced by the thread runner,
        samples off_policy rollout[s] from replay memory and updates the parameters.
        The update is then sent to the parameter server.
        """
        sess.run(self.sync)  # copy weights from shared to local

        # Get and process on_policy_rollout for A3C train step:
        on_policy_rollout = self.pull_batch_from_queue()
        on_policy_batch = on_policy_rollout.process(
            gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for on-policy A3C loss estimation graph:
        feed_dict = {
            pl: value
            for pl, value in zip(self.local_network.a3c_lstm_state_pl_flatten,
                                 flatten_nested(on_policy_batch.features))
        }  # ..passes lstm context
        feed_dict.update({
            self.local_network.a3c_state_in: on_policy_batch.si,
            self.local_network.a3c_a_r_in: on_policy_batch.last_ar,
            self.a3c_act_target: on_policy_batch.a,
            self.a3c_adv_target: on_policy_batch.adv,
            self.a3c_r_target: on_policy_batch.r,
            self.local_network.train_phase: True,
        })

        if self.use_off_policy_a3c or self.use_pixel_control or self.use_value_replay:
            # Get sample from replay memory:
            if self.use_rebalanced_replay:
                off_policy_sample = self.memory.sample_priority(
                    self.replay_rollout_length,
                    skewness=self.rebalance_skewness,
                    exact_size=False)
            else:
                off_policy_sample = self.memory.sample_uniform(
                    self.replay_rollout_length)

            off_policy_rollout = Rollout()
            off_policy_rollout.add_memory_sample(off_policy_sample)
            off_policy_batch = off_policy_rollout.process(
                gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

            # Feeder for off-policy A3C loss estimation graph:
            off_policy_feeder = {
                pl: value
                for pl, value in zip(
                    self.local_network.off_a3c_lstm_state_pl_flatten,
                    flatten_nested(off_policy_batch.features))
            }
            off_policy_feeder.update({
                self.local_network.off_a3c_state_in:
                off_policy_batch.si,
                self.local_network.off_a3c_a_r_in:
                off_policy_batch.last_ar,
                self.off_policy_act_target:
                off_policy_batch.a,
                self.off_policy_adv_target:
                off_policy_batch.adv,
                self.off_policy_r_target:
                off_policy_batch.r,
            })
            feed_dict.update(off_policy_feeder)

        # Update with reward prediction subgraph:
        if self.use_reward_prediction:
            # Rebalanced 50/50 sample for RP:
            rp_sample = self.memory.sample_priority(self.rp_sequence_size,
                                                    skewness=2,
                                                    exact_size=True)
            feed_dict.update(self.process_rp(rp_sample))

        # Pixel control ...
        if self.use_pixel_control:
            feed_dict.update(self.process_pc(off_policy_batch))

        # VR...
        if self.use_value_replay:
            feed_dict.update(self.process_vr(off_policy_batch))

        if self.use_memory:
            # Save on_policy_rollout to replay memory:
            self.memory.add_rollout(on_policy_rollout)

        # Every worker writes model summaries:
        should_compute_summary =\
            self.local_steps % self.model_summary_freq == 0   # self.task == 0 and

        if should_compute_summary:
            fetches = [self.model_summary_op, self.train_op, self.global_step]
        else:
            fetches = [self.train_op, self.global_step]

        #print('TRAIN_FEED_DICT:\n', feed_dict)
        #print('\n=======S=======\n')
        #for key,value in feed_dict.items():
        #    try:
        #        print(key,':', value.shape,'\n')
        #    except:
        #        print(key, ':', value, '\n')
        #print('\n=====E======\n')

        # And finally...
        fetched = sess.run(fetches, feed_dict=feed_dict)

        if should_compute_summary:
            self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]),
                                            fetched[-1])
            self.summary_writer.flush()

        self.local_steps += 1
Example #16
    def process(self, sess):
        """
        Grabs an on_policy_rollout that's been produced by the thread runner,
        samples off_policy rollout[s] from replay memory and updates the parameters.
        The update is then sent to the parameter server.
        """

        # Copy weights from shared to local new_policy:
        sess.run(self.sync)

        # Get and process rollout for on-policy train step:
        on_policy_rollout = self.pull_batch_from_queue()
        on_policy_batch = on_policy_rollout.process(
            gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for on-policy AAC loss estimation graph:
        feed_dict = {
            pl: value
            for pl, value in zip(self.local_network.on_lstm_state_pl_flatten,
                                 flatten_nested(on_policy_batch['context']))
        }
        feed_dict.update({
            self.local_network.on_state_in:
            on_policy_batch['state'],
            self.local_network.on_a_r_in:
            on_policy_batch['last_action_reward'],
            self.on_pi_act_target:
            on_policy_batch['action'],
            self.on_pi_adv_target:
            on_policy_batch['advantage'],
            self.on_pi_r_target:
            on_policy_batch['r'],
            self.local_network.train_phase:
            True,
        })

        if self.use_off_policy_aac or self.use_pixel_control or self.use_value_replay:
            # Get sample from replay memory:
            if self.use_rebalanced_replay:
                off_policy_sample = self.memory.sample_priority(
                    self.replay_rollout_length,
                    skewness=self.rebalance_skewness,
                    exact_size=False)
            else:
                off_policy_sample = self.memory.sample_uniform(
                    self.replay_rollout_length)

            off_policy_rollout = Rollout()
            off_policy_rollout.add_memory_sample(off_policy_sample)
            off_policy_batch = off_policy_rollout.process(
                gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

            # Feeder for off-policy AAC loss estimation graph:
            off_policy_feeder = {
                pl: value
                for pl, value in zip(
                    self.local_network.off_lstm_state_pl_flatten,
                    flatten_nested(off_policy_batch['context']))
            }

            off_policy_feeder.update({
                self.local_network.off_state_in:
                off_policy_batch['state'],
                self.local_network.off_a_r_in:
                off_policy_batch['last_action_reward'],
                self.off_pi_act_target:
                off_policy_batch['action'],
                self.off_pi_adv_target:
                off_policy_batch['advantage'],
                self.off_pi_r_target:
                off_policy_batch['r'],
            })
            feed_dict.update(off_policy_feeder)

        # Update with reward prediction subgraph:
        if self.use_reward_prediction:
            # Rebalanced 50/50 sample for RP:
            rp_sample = self.memory.sample_priority(self.rp_sequence_size,
                                                    skewness=2,
                                                    exact_size=True)
            feed_dict.update(self.process_rp(rp_sample))

        # Pixel control ...
        if self.use_pixel_control:
            feed_dict.update(self.process_pc(off_policy_batch))

        # VR...
        if self.use_value_replay:
            feed_dict.update(self.process_vr(off_policy_batch))

        if self.use_memory:
            # Save on_policy_rollout to replay memory:
            self.memory.add_rollout(on_policy_rollout)

        # Every worker writes model summaries:
        should_compute_summary =\
            self.local_steps % self.model_summary_freq == 0

        if should_compute_summary:
            fetches = [self.train_op, self.model_summary_op, self.inc_step]
        else:
            fetches = [self.train_op, self.inc_step]

        fetched = sess.run(fetches, feed_dict=feed_dict)

        if should_compute_summary:
            self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]),
                                            fetched[-1])
            self.summary_writer.flush()

        self.local_steps += 1