def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
    """
    Calculates the target Q-retrace

    :param rewards: ([TensorFlow Tensor]) The rewards
    :param dones: ([TensorFlow Tensor]) The episode-termination flags (1.0 where the episode ended)
    :param q_i: ([TensorFlow Tensor]) The Q values for actions taken
    :param values: ([TensorFlow Tensor]) The output of the value functions
    :param rho_i: ([TensorFlow Tensor]) The importance weight for each action
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) The discount value
    :return: ([TensorFlow Tensor]) the target Q-retrace
    """
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), n_envs, n_steps, True)  # list of len steps, shape [n_envs]
    reward_seq = batch_to_seq(rewards, n_envs, n_steps, True)  # list of len steps, shape [n_envs]
    done_seq = batch_to_seq(dones, n_envs, n_steps, True)  # list of len steps, shape [n_envs]
    q_is = batch_to_seq(q_i, n_envs, n_steps, True)
    value_sequence = batch_to_seq(values, n_envs, n_steps + 1, True)
    final_value = value_sequence[-1]
    qret = final_value
    qrets = []
    for i in range(n_steps - 1, -1, -1):
        check_shape([qret, done_seq[i], reward_seq[i], rho_bar[i], q_is[i], value_sequence[i]], [[n_envs]] * 6)
        qret = reward_seq[i] + gamma * qret * (1.0 - done_seq[i])
        qrets.append(qret)
        qret = (rho_bar[i] * (qret - q_is[i])) + value_sequence[i]
    qrets = qrets[::-1]
    qret = seq_to_batch(qrets, flat=True)
    return qret
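
To make the shapes concrete, here is a minimal sketch of driving q_retrace in isolation. It assumes TF1 graph mode and that the stable-baselines helpers it depends on (batch_to_seq, seq_to_batch, check_shape) are in scope; the feed values are made up.

import numpy as np
import tensorflow as tf

n_envs, n_steps, gamma = 2, 3, 0.99
total = n_envs * n_steps

# Flat batches, laid out env-major as batch_to_seq(..., flat=True) expects:
# [env0_t0, env0_t1, env0_t2, env1_t0, ...]
rewards = tf.placeholder(tf.float32, [total])
dones = tf.placeholder(tf.float32, [total])
q_i = tf.placeholder(tf.float32, [total])
rho_i = tf.placeholder(tf.float32, [total])
# values carries one extra (bootstrap) step per env, hence n_steps + 1
values = tf.placeholder(tf.float32, [n_envs * (n_steps + 1)])

qret = q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma)

with tf.Session() as sess:
    out = sess.run(qret, feed_dict={
        rewards: np.ones(total), dones: np.zeros(total),
        q_i: np.zeros(total), rho_i: np.ones(total),
        values: np.zeros(n_envs * (n_steps + 1)),
    })
    print(out.shape)  # (6,): a flat [n_envs * n_steps] target
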
Example #2
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None,
                 net_arch=None, layer_norm=False, feature_extraction="cnn",
                 **kwargs):
        # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
        super(RelationalLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                   state_shape=(2 * n_lstm, ), reuse=reuse,
                                                   scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        with tf.variable_scope("model", reuse=reuse):
            print('self.processed_obs', self.processed_obs)
            relation_block_output = self.relation_block(self.processed_obs)

            # original code
            input_sequence = batch_to_seq(relation_block_output, self.n_env, n_steps)
            print('input_sequence', input_sequence)
            masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                         layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'vf', 1)
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self._value_fn = value_fn

        self._setup_init()
Example #3
File: policies.py  Project: guytenn/Act2Vec
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 n_lstm=256,
                 reuse=False,
                 layers=None,
                 cnn_extractor=nature_cnn,
                 layer_norm=False,
                 feature_extraction="cnn",
                 **kwargs):
        super(LstmPolicy, self).__init__(sess,
                                         ob_space,
                                         ac_space,
                                         n_env,
                                         n_steps,
                                         n_batch,
                                         n_lstm,
                                         reuse,
                                         scale=(feature_extraction == "cnn"))

        if layers is None:
            layers = [64, 64]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                extracted_features = cnn_extractor(
                    (self.processed_obs, self.processed_obs_len), **kwargs)
            else:
                activ = tf.tanh
                extracted_features = tf.layers.flatten(self.processed_obs)
                for i, layer_size in enumerate(layers):
                    extracted_features = activ(
                        linear(extracted_features,
                               'pi_fc' + str(i),
                               n_hidden=layer_size,
                               init_scale=np.sqrt(2)))
            input_sequence = batch_to_seq(extracted_features, self.n_env,
                                          n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'vf', 1)

            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self.value_fn = value_fn
        self.initial_state = np.zeros((self.n_env, n_lstm * 2),
                                      dtype=np.float32)
        self._setup_init()
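
Every policy in this listing shares the same recurrent core: flatten the per-step features into a [n_env * n_steps, d] batch, split it into n_steps tensors of shape [n_env, d] with batch_to_seq, run the mask-aware lstm, then merge back with seq_to_batch. A minimal shape sketch, assuming the usual stable-baselines TF1 signatures for these helpers:

import tensorflow as tf

n_env, n_steps, feat_dim, n_lstm = 4, 5, 32, 64

features = tf.placeholder(tf.float32, [n_env * n_steps, feat_dim])
dones = tf.placeholder(tf.float32, [n_env * n_steps])
# state holds the LSTM cell and hidden vectors side by side
states = tf.placeholder(tf.float32, [n_env, 2 * n_lstm])

input_sequence = batch_to_seq(features, n_env, n_steps)  # n_steps x [n_env, feat_dim]
masks = batch_to_seq(dones, n_env, n_steps)              # n_steps x [n_env, 1]
rnn_output, snew = lstm(input_sequence, masks, states, 'lstm1', n_hidden=n_lstm)
rnn_output = seq_to_batch(rnn_output)                    # back to [n_env * n_steps, n_lstm]

print(rnn_output.shape)  # (20, 64)
print(snew.shape)        # (4, 128): the state to feed back on the next rollout
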
Example #4
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        super(RnnPolicy, self).__init__(ob_space, ac_space, hidsize, ob_mean,
                                        ob_std, feat_dim, layernormalize, nl,
                                        n_env, n_steps, reuse, n_lstm, scope)

        with tf.variable_scope(scope, reuse=self.reuse):
            ## Use features
            x = self.flat_features

            input_sequence = batch_to_seq(x, self.n_env, self.n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=False)
            rnn_output = seq_to_batch(rnn_output)
            rnn_output = layernorm(rnn_output)

            ## Concat
            q = self.flat_features
            q = tf.concat([q, rnn_output], axis=1)
            q = fc(q, units=hidsize, activation=activ, name="fc1")
            q = fc(q, units=hidsize, activation=activ, name="fc2")

            pdparam, vpred = self.get_pdparam(q)

        self.pdparam = pdparam = unflatten_first_dim(pdparam, self.sh)
        self.vpred = unflatten_first_dim(vpred, self.sh)[:, :, 0]
        self.pd = pd = self.ac_pdtype.proba_distribution_from_flat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
Example #5
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None,
                 cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn", **kwargs):
        # super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
        #                                  scale=(feature_extraction == "cnn"))
        # add this function to LstmPolicy to init ActorCriticPolicy
        self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, feature_extraction)

        with tf.variable_scope("model", reuse=reuse):
            extracted_features = cnn_extractor(self.processed_x, **kwargs)  # [B, H, W, Depth]
            print('extracted_features', extracted_features)
            coor = get_coor(extracted_features)
            # [B, H, W, D+2]
            entities = tf.concat([extracted_features, coor], axis=3)
            print('entities:', entities)
            # [B, H*W, num_heads, Depth=D+2]
            cin_output, attentions = CIN(entities, 'CIN')
            self.attention = attentions[0]
            print('CIN:', cin_output)
            # max_pooling
            # cin_maxpooling_output = tf.reduce_max(cin_output, axis=[1])
            # cin_output = tf.reshape(cin_output, [-1, 588, 2 * 66])
            maxpooling_shape = entities.shape[1] * entities.shape[2]
            print('maxpooling_shape:', maxpooling_shape)
            cin_maxpooling_output = tf.nn.pool(cin_output, window_shape=[maxpooling_shape],
                                               padding='VALID', strides=[maxpooling_shape],
                                               pooling_type="MAX")

            print('cin_maxpooling_output', cin_maxpooling_output)
            input_sequence = batch_to_seq(cin_maxpooling_output, self.n_env, n_steps)
            # input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                         layer_norm=layer_norm)

            rnn_output = seq_to_batch(rnn_output)
            # print('rnn_output', rnn_output, '      snew', self.snew)

            value_fn = linear(rnn_output, 'vf', 1)

            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self.value_fn = value_fn
        self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
        self._setup_init()
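
The tf.nn.pool call above, with a window and stride spanning the whole entity axis, is just a max over that axis; tf.reduce_max (which Example #12 below uses for the same purpose) gives an identical result. A small check, using the 588-entity, 132-channel layout from the commented-out reshape above:

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 588, 132])  # [B, entities, channels]

pooled_a = tf.nn.pool(x, window_shape=[588], strides=[588],
                      padding='VALID', pooling_type='MAX')   # [B, 1, 132]
pooled_b = tf.reduce_max(x, axis=[1], keepdims=True)         # [B, 1, 132]

with tf.Session() as sess:
    data = np.random.rand(2, 588, 132).astype(np.float32)
    a, b = sess.run([pooled_a, pooled_b], {x: data})
    print(np.allclose(a, b))  # True
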
Example #6
def strip(var, n_envs, n_steps, flat=False):
    """
    Removes the last step in the batch

    :param var: (TensorFlow Tensor) The input Tensor
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: (TensorFlow Tensor) the input tensor, without the last step in the batch
    """
    out_vars = batch_to_seq(var, n_envs, n_steps + 1, flat)
    return seq_to_batch(out_vars[:-1], flat)
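
strip is nothing more than a batch_to_seq / seq_to_batch round trip that drops the final element of each per-env sequence. A minimal numeric sketch (env-major flat layout, as above):

import numpy as np
import tensorflow as tf

n_envs, n_steps = 2, 3
# n_steps + 1 entries per env; the extra step is what strip removes
var = tf.placeholder(tf.float32, [n_envs * (n_steps + 1)])
stripped = strip(var, n_envs, n_steps, flat=True)

with tf.Session() as sess:
    data = np.arange(n_envs * (n_steps + 1), dtype=np.float32)
    # data = [0 1 2 3 | 4 5 6 7]; 3 and 7 are each env's last step
    print(sess.run(stripped, {var: data}))  # [0. 1. 2. 4. 5. 6.]
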
Example #7
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 n_lstm=256,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 layer_norm=False,
                 feature_extraction="cnn",
                 **kwargs):
        # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
        super(LstmPolicy, self).__init__(sess,
                                         ob_space,
                                         ac_space,
                                         n_env,
                                         n_steps,
                                         n_batch,
                                         state_shape=(2 * n_lstm, ),
                                         reuse=reuse,
                                         scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if net_arch is None:  # Legacy mode
            if layers is None:
                layers = [64, 64]
            else:
                warnings.warn(
                    "The layers parameter is deprecated. Use the net_arch parameter instead."
                )

            with tf.variable_scope("model", reuse=reuse):
                if feature_extraction == "cnn":
                    extracted_features = cnn_extractor(self.processed_obs,
                                                       **kwargs)
                else:
                    extracted_features = tf.layers.flatten(self.processed_obs)
                    for i, layer_size in enumerate(layers):
                        extracted_features = act_fun(
                            linear(extracted_features,
                                   'pi_fc' + str(i),
                                   n_hidden=layer_size,
                                   init_scale=np.sqrt(2)))
                input_sequence = batch_to_seq(extracted_features, self.n_env,
                                              n_steps)
                masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                rnn_output, self.snew = lstm(input_sequence,
                                             masks,
                                             self.states_ph,
                                             'lstm1',
                                             n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output = seq_to_batch(rnn_output)
                value_fn = linear(rnn_output, 'vf', 1)

                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

            self._value_fn = value_fn
        else:  # Use the new net_arch parameter
            if layers is not None:
                warnings.warn(
                    "The new net_arch parameter overrides the deprecated layers parameter."
                )
            if feature_extraction == "cnn":
                raise NotImplementedError()

            with tf.variable_scope("model", reuse=reuse):
                latent = tf.layers.flatten(self.processed_obs)
                policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
                value_only_layers = []  # Layer sizes of the network that only belong to the value network

                # Iterate through the shared layers and build the shared parts of the network
                lstm_layer_constructed = False
                for idx, layer in enumerate(net_arch):
                    if isinstance(layer, int):  # Check that this is a shared layer
                        layer_size = layer
                        latent = act_fun(
                            linear(latent,
                                   "shared_fc{}".format(idx),
                                   layer_size,
                                   init_scale=np.sqrt(2)))
                    elif layer == "lstm":
                        if lstm_layer_constructed:
                            raise ValueError(
                                "The net_arch parameter must only contain one occurrence of 'lstm'!"
                            )
                        input_sequence = batch_to_seq(latent, self.n_env,
                                                      n_steps)
                        masks = batch_to_seq(self.dones_ph, self.n_env,
                                             n_steps)
                        rnn_output, self.snew = lstm(input_sequence,
                                                     masks,
                                                     self.states_ph,
                                                     'lstm1',
                                                     n_hidden=n_lstm,
                                                     layer_norm=layer_norm)
                        latent = seq_to_batch(rnn_output)
                        lstm_layer_constructed = True
                    else:
                        assert isinstance(
                            layer, dict
                        ), "Error: the net_arch list can only contain ints and dicts"
                        if 'pi' in layer:
                            assert isinstance(
                                layer['pi'], list
                            ), "Error: net_arch[-1]['pi'] must contain a list of integers."
                            policy_only_layers = layer['pi']

                        if 'vf' in layer:
                            assert isinstance(
                                layer['vf'], list
                            ), "Error: net_arch[-1]['vf'] must contain a list of integers."
                            value_only_layers = layer['vf']
                        break  # From here on the network splits up in policy and value network

                # Build the non-shared part of the policy-network
                latent_policy = latent
                for idx, pi_layer_size in enumerate(policy_only_layers):
                    if pi_layer_size == "lstm":
                        raise NotImplementedError(
                            "LSTMs are only supported in the shared part of the policy network."
                        )
                    assert isinstance(
                        pi_layer_size, int
                    ), "Error: net_arch[-1]['pi'] must only contain integers."
                    latent_policy = act_fun(
                        linear(latent_policy,
                               "pi_fc{}".format(idx),
                               pi_layer_size,
                               init_scale=np.sqrt(2)))

                # Build the non-shared part of the value-network
                latent_value = latent
                for idx, vf_layer_size in enumerate(value_only_layers):
                    if vf_layer_size == "lstm":
                        raise NotImplementedError(
                            "LSTMs are only supported in the shared part of the value function "
                            "network.")
                    assert isinstance(
                        vf_layer_size, int
                    ), "Error: net_arch[-1]['vf'] must only contain integers."
                    latent_value = act_fun(
                        linear(latent_value,
                               "vf_fc{}".format(idx),
                               vf_layer_size,
                               init_scale=np.sqrt(2)))

                if not lstm_layer_constructed:
                    raise ValueError(
                        "The net_arch parameter must contain at least one occurrence of 'lstm'!"
                    )

                self._value_fn = linear(latent_value, 'vf', 1)
                # TODO: why not init_scale = 0.001 here like in the feedforward
                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
        self._setup_init()
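
The net_arch branch accepts shared int layers, exactly one 'lstm' token, and optionally a final dict that splits the policy and value heads. A hedged usage sketch, assuming stable-baselines 2.x with its registered 'MlpLstmPolicy':

from stable_baselines import PPO2

# One shared 64-unit layer, then the LSTM, then separate 32-unit heads.
model = PPO2('MlpLstmPolicy', 'CartPole-v1',
             policy_kwargs=dict(net_arch=[64, 'lstm', dict(pi=[32], vf=[32])],
                                n_lstm=128),
             nminibatches=1,  # recurrent PPO2 needs n_envs % nminibatches == 0
             verbose=1)
model.learn(total_timesteps=10000)
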
Example #8
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 n_lstm=256,
                 reuse=False,
                 layers=None,
                 cnn_extractor=custom_cnn,
                 layer_norm=False,
                 feature_extraction="cnn",
                 **kwargs):
        # super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
        #                                  scale=(feature_extraction == "cnn"))
        # add this function to LstmPolicy to init ActorCriticPolicy
        self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm,
                     reuse, feature_extraction)

        with tf.variable_scope("model", reuse=reuse):
            extracted_features = cnn_extractor(self.processed_x,
                                               **kwargs)  # vectors v_t
            print('extracted_features', extracted_features)
            last_num_height = extracted_features.shape[1]
            last_num_width = extracted_features.shape[2]
            # print(last_width)
            last_num_features = extracted_features.shape[3]
            n_hiddens = 42

            x2 = tf.reshape(
                extracted_features,
                [-1, last_num_height * last_num_width, last_num_features])
            print('x2', x2)
            x3 = tf.nn.relu(
                conv(extracted_features,
                     'x3',
                     n_filters=n_hiddens,
                     filter_size=1,
                     stride=1,
                     init_scale=np.sqrt(2),
                     **kwargs))
            print('x3', x3)

            print('states', self.states_ph)
            # states_ph is per-env; tile it across n_steps so it aligns with the [n_env * n_steps] batch
            h0 = tf.expand_dims(self.states_ph, 1)
            h0 = tf.tile(h0, [1, self.n_steps, 1])
            print('h0', h0)
            h0 = tf.reshape(h0, [-1, h0.shape[2]])
            print('h0', h0)
            h1 = linear_without_bias(h0,
                                     'fc_h1',
                                     n_hidden=n_hiddens,
                                     init_scale=np.sqrt(2))
            print('h1', h1)
            # replicate [B, n_hiddens] to [B, H*W, n_hiddens]
            h2 = tf.expand_dims(h1, 1)
            h2 = tf.tile(h2, [1, last_num_height * last_num_width, 1])
            print('h2', h2)

            h3 = tf.reshape(h2,
                            [-1, last_num_height, last_num_width, n_hiddens])
            print('h3', h3)

            a1 = tf.nn.tanh(tf.add(h3, x3))
            a2 = tf.nn.relu(
                conv(a1,
                     'a2',
                     n_filters=1,
                     filter_size=1,
                     stride=1,
                     init_scale=np.sqrt(2),
                     **kwargs))
            print('a2', a2)

            a3 = tf.nn.softmax(
                tf.reshape(a2,
                           [-1, last_num_height * last_num_width]))  # attention
            print('a3', a3)
            self.attention = a3

            a4 = tf.expand_dims(a3, 2)
            a4 = tf.tile(a4, [1, 1, last_num_features])
            print('a4', a4)

            context = tf.reduce_sum(tf.multiply(a4, x2), 2)  # weighted sum over the feature axis -> [B, H*W]
            print('context', context)
            input_sequence = batch_to_seq(context, self.n_env, n_steps)
            # input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=layer_norm)

            rnn_output = seq_to_batch(rnn_output)
            # print('rnn_output', rnn_output, '      snew', self.snew)

            value_fn = linear(rnn_output, 'vf', 1)

            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self.value_fn = value_fn
        self.initial_state = np.zeros((self.n_env, n_lstm * 2),
                                      dtype=np.float32)
        self._setup_init()
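
For comparison, the textbook soft-attention readout weights each spatial vector by its score and sums over positions (axis 1), producing a [B, D] context; the code above instead reduces over the feature axis (axis 2), yielding a [B, H*W] context. A minimal numpy sketch of the axis-1 variant, with made-up tensors:

import numpy as np

B, HW, D = 2, 6, 4
v = np.random.rand(B, HW, D).astype(np.float32)    # entity vectors (x2 above)
scores = np.random.rand(B, HW).astype(np.float32)  # pre-softmax logits (a2)

a = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax (a3)
context = (a[:, :, None] * v).sum(axis=1)                       # [B, D]
print(context.shape)  # (2, 4)
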
Example #9
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=4, reuse=False, layers=None,
                 net_arch=None, act_fun=tf.tanh, cnn_extractor=custom_cnn, layer_norm=True, feature_extraction="cnn", params=None,
                 **kwargs):
        super(CustomCnnLnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                         state_shape=(2 * n_lstm, ), reuse=reuse,
                                         scale=(feature_extraction == "cnn"))
        config = params
        init_scale = params['pd_init_scale']
        activ = getattr(tf.nn, params['activ'])
        initializer = getattr(tf, params['kernel_initializer'])
        self._kwargs_check(feature_extraction, kwargs)
        net_arch = config['shared']
        net_arch.append(dict(pi=config['h_actor'],
                             vf=config['h_critic']))

        if net_arch is None:  # Legacy mode (unreachable here: net_arch was just built from params above)
            if layers is None:
                layers = [64, 64]

            with tf.variable_scope("model", reuse=reuse):
                if feature_extraction == "cnn":
                    extracted_features = cnn_extractor(self.processed_obs, params)
                else:
                    extracted_features = tf.layers.flatten(self.processed_obs)
                    for i, layer_size in enumerate(layers):
                        extracted_features = act_fun(linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size,
                                                            init_scale=np.sqrt(2)))
                input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
                masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output = seq_to_batch(rnn_output)
                value_fn = linear(rnn_output, 'vf', 1)

                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

            self._value_fn = value_fn
        else:  # Use the new net_arch parameter

            with tf.variable_scope("model", reuse=reuse):
                extracted_features = cnn_extractor(self.processed_obs, params)
                
                latent = tf.layers.flatten(extracted_features)
                policy_only_layers = [] 
                value_only_layers = [] 

                lstm_layer_constructed = False
                for idx, layer in enumerate(net_arch):
                    if isinstance(layer, int):
                        layer_size = layer
                        latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
                    elif layer == "lstm":
                        if lstm_layer_constructed:
                            raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!")
                        input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                        masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                        rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                                     layer_norm=layer_norm)
                        latent = seq_to_batch(rnn_output)
                        lstm_layer_constructed = True
                    else:
                        assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
                        if 'pi' in layer:
                            assert isinstance(layer['pi'],
                                              list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                            policy_only_layers = layer['pi']

                        if 'vf' in layer:
                            assert isinstance(layer['vf'],
                                              list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                            value_only_layers = layer['vf']
                        break 

                latent_policy = latent
                for idx, pi_layer_size in enumerate(policy_only_layers):
                    if pi_layer_size == "lstm":
                        raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.")
                    assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
                    latent_policy = act_fun(
                        linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

                latent_value = latent
                for idx, vf_layer_size in enumerate(value_only_layers):
                    if vf_layer_size == "lstm":
                        raise NotImplementedError("LSTMs are only supported in the shared part of the value function "
                                                  "network.")
                    assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
                    latent_value = act_fun(
                        linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

                if not lstm_layer_constructed:
                    raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!")

                self._value_fn = linear(latent_value, 'vf', 1)
                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
        self._setup_init()
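
Example #9 drives its architecture from a params dict instead of keyword arguments. The keys it reads are visible in the constructor; a hypothetical value consistent with them (names and sizes are illustrative only):

params = dict(
    pd_init_scale=0.05,
    activ='relu',                                 # resolved via getattr(tf.nn, ...)
    kernel_initializer='orthogonal_initializer',  # resolved via getattr(tf, ...)
    shared=[64, 'lstm'],                          # shared trunk of net_arch
    h_actor=[32],                                 # appended as dict(pi=[32], ...)
    h_critic=[32],                                # ... vf=[32])
)
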
Example #10
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 n_lstm=150,
                 reuse=False,
                 layers=None,
                 goal_num=1,
                 goal_net_arch=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 layer_norm=False,
                 goal_encoder='mlp',
                 feature_extraction="mlp",
                 **kwargs):
        # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
        super(GoalsConditionedLSTMPolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             state_shape=(2 * n_lstm, ),
                             reuse=reuse,
                             scale=(feature_extraction == "mlp"))

        self.goal_encoder = goal_encoder
        # self._kwargs_check(feature_extraction, kwargs)
        self.name = "lstm_policy_" + goal_encoder

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            self.obs_goals = tf.placeholder(dtype=ob_space.dtype,
                                            shape=(None, ob_space.shape[0]),
                                            name='goal_states')
            obs_goals_reshape = self.obs_goals  #tf.reshape(tensor=self.obs_goals, shape=(-1, self.goal_num * ob_space.shape[0]))

            if goal_encoder == "mlp_sample":
                logging.info('mlp encoder with z sampling')
                self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(
                    obs_goals_reshape, goal_net_arch, act_fun)
                eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq),
                                       mean=0,
                                       stddev=1,
                                       dtype=tf.float32)
                self.z_goal_sample = self.z_mu + tf.sqrt(
                    tf.exp(self.z_log_sigma_sq)) * eps
            if goal_encoder == "mlp":
                logging.info('mlp encoder with z mu')
                self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(
                    obs_goals_reshape, goal_net_arch, act_fun)
                self.z_goal_sample = self.z_mu
            if goal_encoder == "no_encoder" or goal_encoder == 'no_goal_proposing':
                self.z_goal_sample = tf.stop_gradient(self.obs_goals)

            self.z_goal_input = tf.placeholder(dtype=ob_space.dtype,
                                               shape=self.z_goal_sample.shape,
                                               name='input_z_goal')
            self.use_input_z = tf.placeholder_with_default(False,
                                                           shape=(),
                                                           name='use_input_z')

            def use_sample():
                return self.z_goal_sample

            def use_input():
                return self.z_goal_input

            self.z_goal = tf.cond(self.use_input_z, use_input, use_sample)

            if goal_encoder == 'no_goal_proposing':
                latent = tf.layers.flatten(self.processed_obs)
            else:
                latent = tf.concat(
                    [tf.layers.flatten(self.processed_obs), self.z_goal], 1)
            logging.info('latent shape %s' % latent.shape)

            if net_arch is None:  # Legacy mode
                if layers is None:
                    layers = [64, 64]
                else:
                    warnings.warn(
                        "The layers parameter is deprecated. Use the net_arch parameter instead."
                    )

                if feature_extraction == "cnn":
                    extracted_features = cnn_extractor(self.processed_obs,
                                                       **kwargs)
                else:
                    extracted_features = latent  #tf.layers.flatten(self.processed_obs)
                    for i, layer_size in enumerate(layers):
                        extracted_features = act_fun(
                            linear(extracted_features,
                                   'pi_fc' + str(i),
                                   n_hidden=layer_size,
                                   init_scale=np.sqrt(2)))
                input_sequence = batch_to_seq(extracted_features, self.n_env,
                                              n_steps)
                masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                rnn_output, self.snew = lstm(input_sequence,
                                             masks,
                                             self.states_ph,
                                             'lstm1',
                                             n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output = seq_to_batch(rnn_output)
                value_fn = linear(rnn_output, 'vf', 1)

                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

                self._value_fn = value_fn
            else:  # Use the new net_arch parameter
                if layers is not None:
                    warnings.warn(
                        "The new net_arch parameter overrides the deprecated layers parameter."
                    )
                if feature_extraction == "cnn":
                    raise NotImplementedError()

                # latent = tf.layers.flatten(self.processed_obs)
                policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
                value_only_layers = []  # Layer sizes of the network that only belong to the value network

                # Iterate through the shared layers and build the shared parts of the network
                lstm_layer_constructed = False
                for idx, layer in enumerate(net_arch):
                    if isinstance(layer, int):  # Check that this is a shared layer
                        layer_size = layer
                        latent = act_fun(
                            linear(latent,
                                   "shared_fc{}".format(idx),
                                   layer_size,
                                   init_scale=np.sqrt(2)))
                    elif layer == "lstm":
                        if lstm_layer_constructed:
                            raise ValueError(
                                "The net_arch parameter must only contain one occurrence of 'lstm'!"
                            )
                        input_sequence = batch_to_seq(latent, self.n_env,
                                                      n_steps)
                        masks = batch_to_seq(self.dones_ph, self.n_env,
                                             n_steps)
                        rnn_output, self.snew = lstm(input_sequence,
                                                     masks,
                                                     self.states_ph,
                                                     'lstm1',
                                                     n_hidden=n_lstm,
                                                     layer_norm=layer_norm)
                        latent = seq_to_batch(rnn_output)
                        lstm_layer_constructed = True
                    else:
                        assert isinstance(
                            layer, dict
                        ), "Error: the net_arch list can only contain ints and dicts"
                        if 'pi' in layer:
                            assert isinstance(
                                layer['pi'], list
                            ), "Error: net_arch[-1]['pi'] must contain a list of integers."
                            policy_only_layers = layer['pi']

                        if 'vf' in layer:
                            assert isinstance(
                                layer['vf'], list
                            ), "Error: net_arch[-1]['vf'] must contain a list of integers."
                            value_only_layers = layer['vf']
                        break  # From here on the network splits up in policy and value network

                # Build the non-shared part of the policy-network
                latent_policy = latent
                for idx, pi_layer_size in enumerate(policy_only_layers):
                    if pi_layer_size == "lstm":
                        raise NotImplementedError(
                            "LSTMs are only supported in the shared part of the policy network."
                        )
                    assert isinstance(
                        pi_layer_size, int
                    ), "Error: net_arch[-1]['pi'] must only contain integers."
                    latent_policy = act_fun(
                        linear(latent_policy,
                               "pi_fc{}".format(idx),
                               pi_layer_size,
                               init_scale=np.sqrt(2)))

                # Build the non-shared part of the value-network
                latent_value = latent
                for idx, vf_layer_size in enumerate(value_only_layers):
                    if vf_layer_size == "lstm":
                        raise NotImplementedError(
                            "LSTMs are only supported in the shared part of the value function "
                            "network.")
                    assert isinstance(
                        vf_layer_size, int
                    ), "Error: net_arch[-1]['vf'] must only contain integers."
                    latent_value = act_fun(
                        linear(latent_value,
                               "vf_fc{}".format(idx),
                               vf_layer_size,
                               init_scale=np.sqrt(2)))

                if not lstm_layer_constructed:
                    raise ValueError(
                        "The net_arch parameter must contain at least one occurrence of 'lstm'!"
                    )

                self._value_fn = linear(latent_value, 'vf', 1)
                # TODO: why not init_scale = 0.001 here like in the feedforward
                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)

        if goal_encoder == "mlp_sample":
            kl_coef = 0.01
            latent_loss = -0.5 * tf.reduce_sum(
                1 + self.z_log_sigma_sq - tf.square(self.z_mu) -
                tf.exp(self.z_log_sigma_sq),
                axis=1)

            self.latent_loss = tf.reduce_mean(latent_loss) * kl_coef
        else:
            self.latent_loss = 0

        self._setup_init()
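
The mlp_sample branch penalizes the encoder with the standard diagonal-Gaussian KL term, KL(N(mu, sigma^2) || N(0, I)) = -1/2 * sum(1 + log sigma^2 - mu^2 - sigma^2), scaled by kl_coef = 0.01. A small numpy check of that formula with made-up statistics:

import numpy as np

z_mu = np.array([[0.5, -1.0]])
z_log_sigma_sq = np.array([[0.0, -0.5]])  # log(sigma^2)

latent_loss = -0.5 * np.sum(
    1 + z_log_sigma_sq - np.square(z_mu) - np.exp(z_log_sigma_sq), axis=1)
print(latent_loss)  # per-sample KL; the policy takes the mean and scales by 0.01
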
Example #11
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        super(ErrorPredRnnPolicy,
              self).__init__(ob_space, ac_space, hidsize, ob_mean, ob_std,
                             feat_dim, layernormalize, nl, n_env, n_steps,
                             reuse, n_lstm, scope)
        with tf.variable_scope(scope):
            self.flat_masks_ph = tf.reshape(self.masks_ph,
                                            [self.n_env * self.n_steps])
            self.pred_error = tf.placeholder(
                dtype=tf.float32,
                shape=(self.n_env, self.n_steps, self.hidsize),
                name='pred_error')  # prediction error
            self.flat_pred_error = flatten_two_dims(self.pred_error)

            self.obs_pred = tf.placeholder(dtype=tf.float32,
                                           shape=(self.n_env, self.n_steps,
                                                  self.hidsize),
                                           name='obs_pred')
            self.flat_obs_pred = flatten_two_dims(self.obs_pred)

            with tf.variable_scope(scope, reuse=self.reuse):
                x = tf.concat([
                    self.flat_features, self.flat_obs_pred,
                    self.flat_pred_error
                ],
                              axis=1)

                input_sequence = batch_to_seq(x, self.n_env, self.n_steps)
                masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
                rnn_output, self.snew = lstm(input_sequence,
                                             masks,
                                             self.states_ph,
                                             'lstm1',
                                             n_hidden=n_lstm,
                                             layer_norm=False)
                rnn_output = seq_to_batch(rnn_output)
                rnn_output = layernorm(rnn_output)

                ## Concat
                q = self.flat_features
                q = tf.concat([q, rnn_output], axis=1)
                q = fc(q, units=hidsize, activation=activ, name="fc1")
                q = fc(q, units=hidsize, activation=activ, name="fc2")

                pdparam, vpred = self.get_pdparam(q)
            self.pdparam = pdparam = unflatten_first_dim(pdparam, self.sh)
            self.vpred = unflatten_first_dim(vpred, self.sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.proba_distribution_from_flat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
Example #12
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 n_lstm=256,
                 reuse=False,
                 layers=None,
                 cnn_extractor=nature_cnn,
                 layer_norm=False,
                 feature_extraction="cnn",
                 **kwargs):
        # super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
        #                                  scale=(feature_extraction == "cnn"))
        # add this function to LstmPolicy to init ActorCriticPolicy
        self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm,
                     reuse, feature_extraction)

        with tf.variable_scope("model", reuse=reuse):
            extracted_features = cnn_extractor(self.processed_x,
                                               **kwargs)  # [B, H, W, Depth]
            print('extracted_features', extracted_features)
            coor = get_coor(extracted_features)
            # [B, H, W, D+2]
            entities = tf.concat([extracted_features, coor], axis=3)
            print('entities:', entities)
            # [B, H*W, num_heads, Depth=D+2]
            MHDPA_output, weights = MHDPA(entities,
                                          "extracted_features",
                                          num_heads=2)
            print('MHDPA_output', MHDPA_output)
            self.attention = weights
            # [B, H*W, num_heads, Depth]
            residual_output = residual_block(entities, MHDPA_output)
            print('residual_output', residual_output)

            # max_pooling
            residual_maxpooling_output = tf.reduce_max(residual_output,
                                                       axis=[1])

            print('residual_maxpooling_output', residual_maxpooling_output)
            input_sequence = batch_to_seq(residual_maxpooling_output,
                                          self.n_env, n_steps)
            # input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=layer_norm)

            rnn_output = seq_to_batch(rnn_output)
            # print('rnn_output', rnn_output, '      snew', self.snew)

            value_fn = linear(rnn_output, 'vf', 1)

            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self.value_fn = value_fn
        self.initial_state = np.zeros((self.n_env, n_lstm * 2),
                                      dtype=np.float32)
        self._setup_init()
Example #13
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 n_lstm=128,
                 reuse=False,
                 layers=None,
                 layer_norm=False,
                 feature_extraction="not cnn",
                 **kwargs):
        # super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
        #                                  scale=(feature_extraction == "cnn"))
        # add this function to LstmPolicy to init ActorCriticPolicy
        self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm,
                     reuse, feature_extraction)

        with tf.variable_scope("model", reuse=reuse):
            print('self.processed_x', self.processed_x)
            batch_size = self.n_env * self.n_steps
            past_frame_num = 8
            activ = tf.nn.relu
            n_hiddens = 128

            # throughput, download_time, chunk_size, buffer_size, last_bit_rate, rebuf, play_time_len, end_delay

            throughput = self.processed_x[:, :past_frame_num]
            throughput = tf.reshape(throughput, [-1, past_frame_num, 1])
            throughput_conv = activ(
                conv1d(throughput, scope='throughput_conv1d'))

            download_time = self.processed_x[:, past_frame_num:2 * past_frame_num]
            download_time = tf.reshape(download_time, [-1, past_frame_num, 1])
            download_time_conv = activ(
                conv1d(download_time, scope='download_time_conv1d'))

            chunk_size = self.processed_x[:, 2 * past_frame_num:2 * past_frame_num + 4]
            chunk_size = tf.reshape(chunk_size, [-1, 4, 1])
            chunk_size_conv = activ(
                conv1d(chunk_size, scope='chunk_size_conv1d'))

            buffer_size = self.processed_x[:, 2 * past_frame_num + 4:2 * past_frame_num + 5]
            buffer_size_dense = activ(
                linear(buffer_size,
                       scope='buffer_size_dense',
                       n_hidden=n_hiddens))

            last_bit_rate = self.processed_x[:, 2 * past_frame_num + 5:2 * past_frame_num + 6]
            last_bit_rate_dense = activ(
                linear(last_bit_rate,
                       scope='last_bit_rate_dense',
                       n_hidden=n_hiddens))

            end_delay = self.processed_x[:, 2 * past_frame_num + 6:2 * past_frame_num + 7]
            end_delay_dense = activ(
                linear(end_delay, scope='end_delay_dense', n_hidden=n_hiddens))

            net_input = [
                tf.reshape(throughput_conv, [batch_size, -1]),
                tf.reshape(download_time_conv, [batch_size, -1]),
                tf.reshape(chunk_size_conv, [batch_size, -1]),
                buffer_size_dense, last_bit_rate_dense, end_delay_dense
            ]

            net_input = tf.concat(net_input, axis=1)  # renamed to avoid shadowing the builtin `input`
            print('net_input', net_input)
            input_sequence = batch_to_seq(net_input, self.n_env, n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'vf', 1)

            self.proba_distribution, self.policy, self.q_value = self.pdtype.proba_distribution_from_latent(
                rnn_output, rnn_output)

        self.value_fn = value_fn
        self.initial_state = np.zeros((self.n_env, n_lstm * 2),
                                      dtype=np.float32)
        self._setup_init()
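
Example #13 hand-slices a flat observation vector; with past_frame_num = 8 the slicing implies this layout: indices 0-7 throughput, 8-15 download_time, 16-19 chunk_size, 20 buffer_size, 21 last_bit_rate, 22 end_delay. A small sketch that names those offsets instead of repeating the arithmetic inline (the dict is illustrative, not part of the original code):

import numpy as np

PAST = 8
SLICES = {
    'throughput':    (0, PAST),
    'download_time': (PAST, 2 * PAST),
    'chunk_size':    (2 * PAST, 2 * PAST + 4),
    'buffer_size':   (2 * PAST + 4, 2 * PAST + 5),
    'last_bit_rate': (2 * PAST + 5, 2 * PAST + 6),
    'end_delay':     (2 * PAST + 6, 2 * PAST + 7),
}

obs = np.arange(23, dtype=np.float32)[None, :]  # one flat observation row
for name, (lo, hi) in SLICES.items():
    print(name, obs[:, lo:hi].shape)
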