예제 #1
0
 def step_logits(self, obs):
     """Run one forward pass and return the sampled action plus head logits.

     Args:
       obs: observation handed unchanged to `self._forward`.

     Returns:
       A pair `(action, logits)`. `action` is the fetched 'a' entry after
       `_squeeze_batch_size_singleton_dim`; `logits` is the fetched 'logits'
       entry exactly as `self._forward` returned it (not squeezed).
     """
     structure = self._ac_structure
     heads = self.net_out.self_fed_heads
     # Pull the per-head sample / logits out of the self-fed heads, keeping
     # the nesting described by the action structure.
     sampled = nest.map_structure_up_to(structure, lambda h: h.sam, heads)
     head_logits = nest.map_structure_up_to(structure, lambda h: h.logits,
                                            heads)
     fetched = self._forward(obs, {'a': sampled, 'logits': head_logits})
     action = _squeeze_batch_size_singleton_dim(fetched['a'])
     return action, fetched['logits']
예제 #2
0
    def setup_fetches(self, outputs):
        """Build per-sample fetch dicts for the requested output names.

        Every available output is split into `self.batch_size` pieces and
        stored in `self.all_outputs`; `self.fetches` then becomes a list
        with one `{name: piece}` dict per batch element.

        Args:
            outputs: list of output names to fetch. NOTE: when
                `self.nc.use_lstm` is set and 'state' is missing, 'state'
                is appended to this list in place.
        """
        batch_size = self.batch_size
        ac_structure = self._ac_structure

        def split_batch(template, tf_structure):
            # Split every leaf into batch_size pieces, then regroup the
            # pieces so that each result follows the template structure.
            per_leaf = [
                tf.split(leaf, batch_size)
                for leaf in nest.flatten_up_to(template, tf_structure)
            ]
            return [
                nest.pack_sequence_as(template, pieces)
                for pieces in zip(*per_leaf)
            ]

        def split_or_empty(tensor):
            # A missing tensor is represented by batch_size empty lists.
            if tensor is not None:
                return tf.split(tensor, batch_size)
            return [[]] * batch_size

        def head_field(field, heads):
            return nest.map_structure_up_to(
                ac_structure, lambda head: getattr(head, field), heads)

        if self.nc.use_self_fed_heads:
            heads = self.net_out.self_fed_heads
            a = head_field('sam', heads)
            neglogp = head_field('neglogp', heads)
            flatparam = head_field('flatparam', heads)
            # Entries are created in the same order as before so that the
            # split ops are added to the graph in an unchanged sequence.
            all_outputs = {}
            all_outputs['a'] = split_batch(ac_structure, a)
            all_outputs['neglogp'] = split_batch(ac_structure, neglogp)
            all_outputs['flatparam'] = split_batch(ac_structure, flatparam)
            all_outputs['v'] = split_or_empty(self.net_out.value_head)
            all_outputs['state'] = split_or_empty(self.net_out.S)
        else:
            flatparam = head_field('flatparam', self.net_out.outer_fed_heads)
            all_outputs = {}
            all_outputs['flatparam'] = split_batch(ac_structure, flatparam)
            all_outputs['state'] = split_or_empty(self.net_out.S)
        self.all_outputs = all_outputs

        state_missing = 'state' not in outputs
        if self.nc.use_lstm and state_missing:
            outputs.append('state')

        # One column per requested name; zipping the columns yields one row
        # (i.e. one fetch dict) per batch element.
        columns = [self.all_outputs[name] for name in outputs]
        self.fetches = [dict(zip(outputs, row)) for row in zip(*columns)]
예제 #3
0
    def debatch_timestep(self, ts):
        """Split one batched timestep into a list of per-sample timesteps.

        Args:
            ts: structure compatible with `self._traj_spec` whose leaves are
                either None or arrays; arrays are split along axis 0.

        Returns:
            A list with one structure per batch element. Each is packed like
            `self._traj_spec`; leaves that were None stay None.
        """
        spec = self._traj_spec

        def split_rows(arr):
            if arr is None:
                return None
            # One piece per row, with the leading length-1 axis removed.
            return [
                np.squeeze(piece, axis=0)
                for piece in np.split(arr, len(arr))
            ]

        # Split along the batch dimension, then view the result as a flat
        # list of per-field row lists.
        per_field = nest.map_structure_up_to(spec, split_rows, ts)
        flat_fields = nest.flatten_up_to(spec, per_field)

        # The batch size is taken from the first non-None field; all the
        # other non-None fields must agree with it.
        sizes = [len(rows) for rows in flat_fields if rows is not None]
        bs = sizes[0]
        assert all(x == bs for x in sizes)

        # Re-pack the i-th row of every field into one structure per sample.
        return [
            nest.pack_sequence_as(
                spec,
                [rows if rows is None else rows[i] for rows in flat_fields])
            for i in range(bs)
        ]
예제 #4
0
def gym_ddpg_actor_test():
    """Smoke test: build the net under scope 'gym_ddpg' and sample an action.

    Creates Box observation/action spaces, builds the network through
    `net_config_cls` / `net_inputs_placeholders_fun` / `net_build_fun`,
    feeds one sampled observation together with zero-filled `S` and `M`
    inputs, and prints the sampled action of the self-fed heads.
    """
    from tensorflow.contrib.framework import nest
    import tpolicies.tp_utils as tp_utils

    mycfg = {
        'test': False,
        'use_loss_type': 'none',
        'use_value_head': False,
        'n_v': 4,
        'use_lstm': True,
        'batch_size': 1,
        'rollout_len': 1,
        'nlstm': 64,
        'hs_len': 64 * 2,
        'lstm_layer_norm': True,
        'weight_decay': 0.0005
    }

    ob_space = spaces.Box(shape=(11, ), dtype=np.float32, low=0, high=1)
    ac_space = spaces.Box(shape=(2, ), low=-1.0, high=1.0, dtype=np.float32)

    nc = net_config_cls(ob_space, ac_space, **mycfg)
    inputs = net_inputs_placeholders_fun(nc)
    out = net_build_fun(inputs, nc, scope='gym_ddpg')
    sample = ob_space.sample()

    # Batch of one: the observation is wrapped in a list, and S / M are
    # zero-filled with a leading dimension of 1.
    feed_dict = {
        inputs.X: [sample],
        inputs.S: np.zeros(shape=[1, nc.hs_len]),
        inputs.M: np.zeros(shape=[1]),
    }

    # The context manager closes the session (releasing its resources) even
    # if initialization or the run raises.
    with tf.Session() as sess:
        tf.global_variables_initializer().run(session=sess)
        ac_structure = tp_utils.template_structure_from_gym_space(ac_space)
        a = nest.map_structure_up_to(ac_structure, lambda head: head.sam,
                                     out.self_fed_heads)
        sam = sess.run(a, feed_dict=feed_dict)
    print(sam)
예제 #5
0
 def forward_squeezed(self, obs):
   """Run a forward pass and return the action separately from the rest.

   When `self.infserver_addr` is None the fetches (sampled action, neglogp
   and value) are built here from the self-fed heads; otherwise
   `self._forward` is called with `fetches=None`.

   Args:
     obs: observation handed unchanged to `self._forward`.

   Returns:
     A pair `(a, ret)`: `a` is the 'a' entry popped from the forward result
     and `ret` is the remaining dict, with 'state' set to
     `self._last_state`.
   """
   fetches = None
   if self.infserver_addr is None:
     heads = self.net_out.self_fed_heads
     value_head = self.net_out.value_head
     fetches = {
       'a': nest.map_structure_up_to(self._ac_structure,
                                     lambda h: h.sam, heads),
       'neglogp': nest.map_structure_up_to(self._ac_structure,
                                           lambda h: h.neglogp, heads),
       # An empty list stands in for a missing value head.
       'v': [] if value_head is None else value_head,
     }
   outputs = self._forward(obs, fetches=fetches)
   outputs['state'] = self._last_state
   action = outputs.pop('a')
   return action, outputs
예제 #6
0
 def step(self, obs):
     """Sample an action for `obs`.

     Args:
       obs: observation handed unchanged to `self._forward`.

     Returns:
       The fetched 'a' entry after `_squeeze_batch_size_singleton_dim`.
     """
     # Only the sampled action of each self-fed head is fetched.
     sampled = nest.map_structure_up_to(self._ac_structure,
                                        lambda h: h.sam,
                                        self.net_out.self_fed_heads)
     result = self._forward(obs, {'a': sampled})
     return _squeeze_batch_size_singleton_dim(result['a'])
예제 #7
0
    def _stack(trajs, traj_spec):
        """Stack trajectories leaf by leaf along a new leading axis.

        Args:
            trajs: sequence of structures compatible with `traj_spec`;
                individual leaves may be None and are then skipped.
            traj_spec: structure whose leaves expose a `dtype` attribute,
                used as the target dtype of the stacked arrays.

        Returns:
            A structure shaped like `traj_spec`. Each leaf is the array
            obtained by stacking the non-None entries on axis 0, or None
            when there is no non-None entry for that leaf.
        """

        def f(spec, *l):
            present = [k for k in l if k is not None]
            if not present:
                # np.stack raises ValueError on an empty sequence; a leaf
                # without any value is reported as None instead.
                return None
            # copy leads to crazy cpu util
            return np.stack(present, axis=0).astype(spec.dtype, copy=False)

        return nest.map_structure_up_to(traj_spec, f, traj_spec, *trajs)
예제 #8
0
    def batch(trajs, traj_spec):
        """Stack trajectories and swap the first two axes of every leaf.

        Args:
            trajs: trajectories forwarded to `Trajectory._stack`.
            traj_spec: structure spec describing the trajectory layout.

        Returns:
            The stacked structure with axes 0 and 1 of each array leaf
            exchanged ("time major" in the original wording); None leaves
            are passed through unchanged.
        """

        def to_time_major(arr):
            if arr is None:
                return None
            return np.swapaxes(arr, 0, 1)

        stacked = Trajectory._stack(trajs, traj_spec)
        return nest.map_structure_up_to(traj_spec, to_time_major, stacked)
예제 #9
0
 def step(self, obs):
   """Sample an action for `obs`.

   When `self.infserver_addr` is None the sampled action of the self-fed
   heads is fetched locally; otherwise `self._forward` is called with
   `fetches=None`.

   Args:
     obs: observation handed unchanged to `self._forward`.

   Returns:
     The 'a' entry of the forward result (not squeezed).
   """
   fetches = None
   if self.infserver_addr is None:
     sampled = nest.map_structure_up_to(self._ac_structure,
                                        lambda h: h.sam,
                                        self.net_out.self_fed_heads)
     fetches = {'a': sampled}
   result = self._forward(obs, fetches=fetches)
   return result['a']
예제 #10
0
    def _stack_ts(self, timesteps):
        """Stack a sequence of timesteps into a single `TimeStep`.

        Should be called after _make_step_spec.

        Args:
            timesteps: iterable of objects providing `_asdict()`.

        Returns:
            A `TimeStep` whose fields are the per-timestep values stacked
            along a new axis 0 and cast to the dtype given by the matching
            leaf of `self._step_spec`.
        """
        step_spec = self._step_spec
        as_dicts = [dict(ts._asdict()) for ts in timesteps]

        def stack_field(spec, *values):
            stacked = np.stack(values, axis=0)
            return stacked.astype(spec.dtype)

        fields = nest.map_structure_up_to(step_spec, stack_field, step_spec,
                                          *as_dicts)
        return TimeStep(**fields)
예제 #11
0
 def forward_squeezed(self, obs):
     """Run a forward pass and return squeezed action, value and neglogp.

     Args:
       obs: observation handed unchanged to `self._forward`.

     Returns:
       A tuple `(a, v, state, neglogp)`. `a`, `v` and `neglogp` are the
       fetched entries after `_squeeze_batch_size_singleton_dim`; `state`
       is `self._last_state`.
     """
     heads = self.net_out.self_fed_heads
     value_head = self.net_out.value_head
     # NOTE: head logits are not part of the fetches (that entry was
     # commented out in the previous version of this method).
     fetches = {
         'a':
         nest.map_structure_up_to(self._ac_structure, lambda h: h.sam,
                                  heads),
         'neglogp':
         nest.map_structure_up_to(self._ac_structure, lambda h: h.neglogp,
                                  heads),
         # An empty list stands in for a missing value head.
         'v': [] if value_head is None else value_head,
     }
     ret = self._forward(obs, fetches)
     squeezed = {
         key: _squeeze_batch_size_singleton_dim(ret[key])
         for key in ('a', 'v', 'neglogp')
     }
     return squeezed['a'], squeezed['v'], self._last_state, squeezed['neglogp']
예제 #12
0
 def head_param(self, obs, action=None):
   """Return the flat distribution parameters of the action heads.

   When `self.infserver_addr` is None the 'flatparam' fetch is built here:
   from the self-fed heads if `action` is None, otherwise from the
   outer-fed heads. When it is not None, `self._forward` is called with
   `fetches=None`.

   Args:
     obs: observation handed unchanged to `self._forward`.
     action: optional action, forwarded to `self._forward`.

   Returns:
     The 'flatparam' entry of the forward result.
   """
   fetches = None
   if self.infserver_addr is None:
     if action is not None:
       assert self.net_out.outer_fed_heads is not None
       heads = self.net_out.outer_fed_heads
     else:
       assert self.net_out.self_fed_heads is not None
       heads = self.net_out.self_fed_heads
     flatparam = nest.map_structure_up_to(self._ac_structure,
                                          lambda h: h.flatparam, heads)
     fetches = {'flatparam': flatparam}
   result = self._forward(obs, fetches, action)
   return result['flatparam']
예제 #13
0
 def logits(self, obs, action=None):
     """Return the logits of the action heads for `obs`.

     The self-fed heads are used when `action` is None, the outer-fed heads
     otherwise.

     Args:
       obs: observation handed unchanged to `self._forward`.
       action: optional action, forwarded to `self._forward`.

     Returns:
       The fetched 'logits' entry after
       `_squeeze_batch_size_singleton_dim`.
     """
     if action is not None:
         assert self.net_out.outer_fed_heads is not None
         heads = self.net_out.outer_fed_heads
     else:
         assert self.net_out.self_fed_heads is not None
         heads = self.net_out.self_fed_heads
     head_logits = nest.map_structure_up_to(self._ac_structure,
                                            lambda h: h.logits, heads)
     result = self._forward(obs, {'logits': head_logits}, action)
     return _squeeze_batch_size_singleton_dim(result['logits'])
예제 #14
0
def conv_lstm_actor_test():
    """Smoke test: build the net under scope 'conv_lstm' and sample actions.

    Creates a two-entry Tuple observation space (each entry is a Tuple of
    three Boxes) and a two-entry Discrete action space, builds the network
    through `net_config_cls` / `net_inputs_placeholders_fun` /
    `net_build_fun`, feeds one sampled observation together with
    zero-filled `S` and `M` inputs, and prints the sampled actions of the
    self-fed heads.
    """
    from tensorflow.contrib.framework import nest
    import tpolicies.tp_utils as tp_utils

    mycfg = {
        'test': False,
        'use_loss_type': 'none',
        'use_value_head': False,
        'n_v': 4,
        'sync_statistics': None,
        'use_lstm': True,
        'batch_size': 1,
        'rollout_len': 1,
        'nlstm': 64,
        'hs_len': 64 * 2,
        'lstm_layer_norm': True,
        'weight_decay': 0.0005
    }

    ob_space = spaces.Tuple([
        spaces.Tuple([
            spaces.Box(shape=(11, 11, 22), dtype=np.float32, low=0, high=1),
            spaces.Box(shape=(2, ), dtype=np.int32, low=0, high=10),
            # np.bool_ instead of the deprecated alias np.bool (removed in
            # NumPy 1.24); both denote the same boolean dtype.
            spaces.Box(shape=[6], dtype=np.bool_, low=0, high=1)
        ])
    ] * 2)
    ac_space = spaces.Tuple([spaces.Discrete(n=6)] * 2)

    nc = net_config_cls(ob_space, ac_space, **mycfg)
    inputs = net_inputs_placeholders_fun(nc)
    out = net_build_fun(inputs, nc, scope='conv_lstm')
    sample = ob_space.sample()

    # Batch of one: every sampled array is wrapped in a list and paired
    # with the matching entry of inputs.X.
    feed_dict = {}
    for sample_group, input_group in zip(sample, inputs.X):
        for x_np, x in zip(sample_group, input_group):
            feed_dict[x] = [x_np]
    feed_dict[inputs.S] = np.zeros(shape=[1, nc.hs_len])
    feed_dict[inputs.M] = np.zeros(shape=[1])

    # The context manager closes the session (releasing its resources) even
    # if initialization or the run raises.
    with tf.Session() as sess:
        tf.global_variables_initializer().run(session=sess)
        ac_structure = tp_utils.template_structure_from_gym_space(ac_space)
        a = nest.map_structure_up_to(ac_structure, lambda head: head.sam,
                                     out.self_fed_heads)
        sam = sess.run(a, feed_dict=feed_dict)
    print(sam)
예제 #15
0
    def debatch_and_stack(self):
        """Regroup the stored batched timesteps into per-sample trajectories.

        Every entry of `self._trajs` is split along its leading (batch)
        axis; the pieces belonging to the same batch index are collected
        over all entries and stacked with `Trajectory._stack`.

        Returns:
            A list with one stacked structure per batch index.
        """
        spec = self._traj_spec

        def split_rows(arr):
            if arr is None:
                return None
            # One piece per row, with the leading length-1 axis removed.
            return [
                np.squeeze(piece, axis=0)
                for piece in np.split(arr, len(arr))
            ]

        per_sample = []
        for traj in self._trajs:
            # Split along the batch dimension and flatten to per-field rows.
            flat_fields = nest.flatten_up_to(
                spec, nest.map_structure_up_to(spec, split_rows, traj))

            # The batch size comes from the first non-None field; all the
            # other non-None fields must agree with it.
            sizes = [len(rows) for rows in flat_fields if rows is not None]
            bs = sizes[0]
            assert all(x == bs for x in sizes)

            # Buckets are created lazily, as long as none exist yet.
            if not per_sample:
                per_sample = [[] for _ in range(bs)]

            for i in range(bs):
                timestep = nest.pack_sequence_as(
                    spec,
                    [rows if rows is None else rows[i]
                     for rows in flat_fields])
                per_sample[i].append(timestep)

        return [
            Trajectory._stack(steps, traj_spec=self._traj_spec)
            for steps in per_sample
        ]
예제 #16
0
def conv_lstm(inputs: ConvLstmInputs,
              nc: ConvLstmConfig,
              scope=None) -> ConvLstmOutputs:
    """Create the whole net for conv-lstm.

    For every player a conv body (five conv2d layers) is applied to the
    first observation entry, the feature vector at the position given by
    the second entry is gathered, and (when `nc.use_lstm`) passed through
    an LSTM embedding block. A categorical action head is built on top; if
    the observation has a third entry it is used to mask the logits. The
    'body' and 'action' scopes use `tf.AUTO_REUSE`, so their variables are
    shared between players.

    Args:
        inputs: network inputs; this function reads `X`, `S`, `M` and, for
            the losses, `A`, `neglogp`, `R`, `V`, `r` and `discount`.
        nc: network config. NOTE: `nc.hs_len` and `nc.nlstm` are divided by
            `nc.n_player` while the per-player bodies are built and
            multiplied back afterwards; they are not restored if building
            raises in between.
        scope: optional variable scope (default name 'pommerman').

    Returns:
        A `ConvLstmOutputs`. The same heads are used for both
        `self_fed_heads` and `outer_fed_heads`; `loss` and `value_head` are
        None unless enabled via `nc.use_loss_type` / `nc.use_value_head`.
    """
    with tf.variable_scope(scope, default_name='pommerman') as sc:
        # NOTE: use name_scope, in case multiple parameter-sharing nets are built
        net_name_scope = tf.get_default_graph().get_name_scope()
        endpoints_collections = net_name_scope + '_endpoints'
        X = inputs.X
        if nc.n_player == 1:
            # wrap single-player inputs so the loop below can treat both
            # cases uniformly
            X = (X, )
            ac_spaces = (nc.ac_space, )
        else:
            ac_spaces = tuple(nc.ac_space.spaces)
        # one slice of the hidden state per player
        S = tf.split(inputs.S, nc.n_player, axis=1)
        # make body
        y = []
        hs_new = []
        heads = []
        if nc.use_lstm and nc.n_player > 1:
            # temporarily switch the config to per-player LSTM sizes
            nc.hs_len //= nc.n_player
            nc.nlstm //= nc.n_player
        for obs, s, ac_space in zip(X, S, ac_spaces):
            with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
                x = tfc_layers.conv2d(obs[0],
                                      nc.spa_ch_dim, [3, 3],
                                      scope='conv0')
                x = tfc_layers.conv2d(x, nc.spa_ch_dim, [5, 5], scope='conv1')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [3, 3],
                                      scope='conv2')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [5, 5],
                                      scope='conv3')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 4, [3, 3],
                                      scope='conv4')
                # prepend the batch index to each position and gather the
                # feature vector located there
                # (assumes obs[1] holds one position per sample - TODO
                # confirm against the observation space)
                pos = tf.to_int32(obs[1])
                ind = tf.concat(
                    [tf.expand_dims(tf.range(nc.batch_size), 1), pos], axis=1)
                x = tf.gather_nd(x, ind)
                if nc.use_lstm:
                    with tf.variable_scope('lstm_embed'):
                        x, hs = tp_layers.lstm_embed_block(
                            inputs_x=x,
                            inputs_hs=s,
                            inputs_mask=inputs.M,
                            nc=nc)
                        hs_new.append(hs)
                y.append(x)

            # make action head
            with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
                head_logits = tfc_layers.fully_connected(x,
                                                         ac_space.n,
                                                         activation_fn=None,
                                                         normalizer_fn=None,
                                                         scope='logits')
                # The mask is the third entry, so at least three entries
                # are required (the former `> 1` check would index past a
                # two-entry observation).
                if len(obs) > 2:
                    head_logits = tp_ops.mask_logits(head_logits, obs[2])
                head = tp_layers.to_action_head(head_logits, CategoricalPdType)
                heads.append(head)

        if nc.use_lstm:
            hs_new = tf.concat(hs_new, axis=1)
            if nc.n_player > 1:
                # restore the config to the full (all-player) LSTM sizes
                nc.hs_len *= nc.n_player
                nc.nlstm *= nc.n_player
        y = tf.concat(y, axis=1)
        heads = tp_utils.pack_sequence_as_structure_like_gym_space(
            nc.ac_space, heads)
        if nc.n_player == 1:
            heads = heads[0]
        # make value head
        vf = None
        if nc.use_value_head:
            assert nc.n_player == 2
            with tf.variable_scope('vf'):
                vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
                vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
                vf = tfc_layers.fully_connected(vf,
                                                nc.n_v,
                                                activation_fn=None,
                                                normalizer_fn=None)
        # make loss
        loss = None
        if nc.use_loss_type in ['rl', 'rl_ppo', 'rl_vtrace']:
            assert nc.n_player == 2
            with tf.variable_scope('losses'):
                # regularization loss
                total_reg_loss = tf.losses.get_regularization_losses(
                    scope=sc.name)
                # entropy loss
                entropy_loss = nest.map_structure_up_to(
                    ac_spaces, lambda head: tf.reduce_mean(head.ent), heads)
                # ppo loss
                neglogp = nest.map_structure_up_to(
                    ac_spaces, lambda head, ac: head.pd.neglogp(ac), heads,
                    inputs.A)
                loss_endpoints = {}
                for k, v in enumerate(entropy_loss):
                    loss_endpoints['ent_' + str(k)] = v
                if nc.use_loss_type == 'rl' or nc.use_loss_type == 'rl_ppo':
                    pg_loss, value_loss = tp_losses.ppo_loss(
                        neglogp=neglogp,
                        oldneglogp=inputs.neglogp,
                        vpred=vf,
                        R=inputs.R,
                        V=inputs.V,
                        masks=None,
                        reward_weights=nc.reward_weights,
                        adv_normalize=True,
                        sync_statistics=nc.sync_statistics)
                elif nc.use_loss_type == 'rl_vtrace':

                    def _batch_to_TB(tsr):
                        # reshape to (nrollout, rollout_len), then transpose
                        return tf.transpose(
                            tf.reshape(tsr,
                                       shape=(nc.nrollout, nc.rollout_len)))

                    lam = tf.convert_to_tensor(nc.lam, tf.float32)
                    vpred_list = [
                        _batch_to_TB(v) for v in tf.split(vf, nc.n_v, axis=1)
                    ]
                    reward_list = [
                        _batch_to_TB(r)
                        for r in tf.split(inputs.r, nc.n_v, axis=1)
                    ]
                    discounts = _batch_to_TB(inputs.discount)
                    # one td-lambda value loss per value head
                    value_loss = []
                    for values, rewards in zip(vpred_list, reward_list):
                        value_loss.append(
                            tp_losses.td_lambda(values,
                                                rewards,
                                                discounts,
                                                lam=lam))
                    value_loss = tf.stack(value_loss)

                    neglogp_list = [
                        _batch_to_TB(neglogp)
                        for neglogp in nest.flatten(neglogp)
                    ]
                    oldneglogp_list = [
                        _batch_to_TB(oldneglogp)
                        for oldneglogp in nest.flatten(inputs.neglogp)
                    ]
                    # combine the value heads / rewards with reward_weights
                    shaped_values = tf.matmul(vf,
                                              nc.reward_weights,
                                              transpose_b=True)
                    shaped_rewards = tf.matmul(inputs.r,
                                               nc.reward_weights,
                                               transpose_b=True)
                    values = tf.transpose(
                        tf.reshape(shaped_values,
                                   shape=(nc.nrollout, nc.rollout_len)))
                    rewards = tf.transpose(
                        tf.reshape(shaped_rewards,
                                   shape=(nc.nrollout, nc.rollout_len)))
                    # sum the vtrace loss over all action heads
                    pg_loss = tf.reduce_sum([
                        tp_losses.vtrace_loss(neglogp, oldneglogp, None,
                                              values, rewards, discounts, 1.0,
                                              1.0) for oldneglogp, neglogp in
                        zip(oldneglogp_list, neglogp_list)
                    ])
                    # upgo loss uses only the first value head / reward
                    upgo_loss = tp_losses.upgo_loss(
                        tf.stack(neglogp_list, axis=-1),
                        tf.stack(oldneglogp_list, axis=-1), None,
                        vpred_list[0], reward_list[0], discounts)
                    loss_endpoints['upgo_loss'] = upgo_loss
                loss_endpoints['pg_loss'] = pg_loss
                # a scalar value loss gets one endpoint, a vector one
                # endpoint per element
                if len(value_loss.shape) == 0:
                    loss_endpoints['value_loss'] = value_loss
                else:
                    for i in range(value_loss.shape[0]):
                        loss_endpoints['value_loss_' + str(i)] = value_loss[i]
                loss = ConvLstmLosses(total_reg_loss=total_reg_loss,
                                      pg_loss=pg_loss,
                                      value_loss=value_loss,
                                      entropy_loss=entropy_loss,
                                      loss_endpoints=loss_endpoints)
                # collect vars, endpoints, etc.
        trainable_vars = _make_vars(sc)
        endpoints = OrderedDict()  # TODO
    return ConvLstmOutputs(self_fed_heads=heads,
                           outer_fed_heads=heads,
                           S=hs_new,
                           loss=loss,
                           vars=trainable_vars,
                           endpoints=endpoints,
                           value_head=vf)
예제 #17
0
def mnet_v6d6_loss(inputs: MNetV6Inputs,
                   outer_fed_heads,
                   value_head,
                   consts: MNetV6Consts,
                   nc: MNetV6Config,
                   net_level_scope: str,
                   structured_mw=None,
                   scope=None):
    # regularization loss. Only `variable`s are involved, so it is safe to
    # collect them using regular expression, e.g., 'mnet_v5.*', regardless
    # of the current name_scope (e.g., 'mnet_v5_1', 'mnet_v5_2', ...)
    total_reg_loss = tf.losses.get_regularization_loss(
        scope='{}.*'.format(net_level_scope))

    total_il_loss = None
    pg_loss = None
    value_loss = None
    entropy_loss = None
    distill_loss = None
    loss_endpoints = {}
    example_ac_sp = tp_utils.map_gym_space_to_structure(
        lambda x: None, nc.ac_space)
    with tf.variable_scope(scope, default_name='mnet_v6_losses'):
        if nc.use_loss_type in ['il', 'rl', 'rl_ppo', 'rl_ppo2', 'rl_vtrace']:
            # head masks and structure template
            if structured_mw is None:
                mw = _action_mask_weights(inputs_ab=inputs.A['A_AB'],
                                          inputs_arg_mask=consts.arg_mask,
                                          weights_include_ab=True)
                structured_mw = tp_utils.pack_sequence_as_structure_like_gym_space(
                    nc.ac_space, mw)
            outer_fed_head_pds = nest.map_structure_up_to(
                example_ac_sp, lambda head: head.pd, outer_fed_heads)

            if nc.use_loss_type == 'il':
                # build imitation learning loss the cross entropy
                total_il_loss, head_xe_loss = tp_losses.multi_head_neglogp_loss(
                    inputs_action_pds=outer_fed_head_pds,
                    inputs_action_labels=inputs.A,
                    inputs_mask_weights=structured_mw,
                    set_loss=nc.il_multi_label_loss,
                )
                assert type(head_xe_loss) == OrderedDict
                loss_endpoints = head_xe_loss
            elif nc.use_loss_type in ['rl', 'rl_ppo', 'rl_ppo2', 'rl_vtrace']:
                # build rl losses

                # the entropy regularizer
                entropy_loss = nest.map_structure_up_to(
                    example_ac_sp,
                    lambda head, mask: tf.reduce_mean(head.ent * mask),
                    outer_fed_heads, structured_mw)

                # distillation loss, i.e., the teacher-student KL regularizer
                distill_loss = None
                ab_distill_loss = None
                if nc.distillation:
                    outer_fed_head_pds = nest.map_structure_up_to(
                        example_ac_sp, lambda head: head.pd, outer_fed_heads)
                    distill_loss = tp_losses.distill_loss(
                        student_pds=outer_fed_head_pds,
                        teacher_logits=inputs.logits,
                        masks=structured_mw)
                    ab_pd = outer_fed_head_pds['A_AB']
                    teacher_logit = inputs.logits['A_AB']
                    # TODO: this is from definition of position encoding, remove it?
                    first_4mins_mask = tf.cast(
                        inputs.X['X_VEC_GAME_PROG'][:, -1] >= np.cos(
                            60 * 4 * np.power(10000, -62 / 64)), tf.float32)
                    first_4mins_mask *= tf.cast((tf.reduce_sum(
                        inputs.X['Z_BUILD_ORDER'], axis=[1, 2]) > 0),
                                                tf.float32)
                    ab_distill_loss = tp_losses.distill_loss(
                        ab_pd, teacher_logit, first_4mins_mask)

                # the main policy gradient loss
                outer_fed_head_neglogp = nest.map_structure_up_to(
                    example_ac_sp, lambda head, ac: head.pd.neglogp(ac),
                    outer_fed_heads, inputs.A)
                loss_endpoints = {}
                if nc.use_loss_type == 'rl' or nc.use_loss_type == 'rl_ppo':
                    # PPO loss
                    pg_loss, value_loss = tp_losses.ppo_loss(
                        outer_fed_head_neglogp,
                        inputs.neglogp,
                        value_head,
                        inputs.R,
                        inputs.V,
                        masks=structured_mw,
                        reward_weights=nc.reward_weights,
                        merge_pi=nc.merge_pi,
                        adv_normalize=nc.adv_normalize,
                        clip_range=nc.clip_range,
                        sync_statistics=nc.sync_statistics,
                    )
                elif nc.use_loss_type in ['rl_ppo2', 'rl_vtrace']:
                    # Note: we need convert the shape (batch_size, ...) to the shape
                    # (T, B, ...) where T=nc.rollout_len, B=nc.nrollout, batch_size=B*T
                    # When computing ppo2-loss and value-loss, only T-1 time steps are
                    # used due to the value bootstrap at the tail. When doing so, the
                    # [:-1] indexing, leading to (T - 1, B, ...) tensor slice, makes life
                    # much easier

                    def _batch_to_TB(tsr):
                        return tf.transpose(
                            tf.reshape(tsr,
                                       shape=(nc.nrollout, nc.rollout_len)))

                    # make the len=n_action_heads lists for action-head stuff
                    # for tensor entry, shape (batch_size, ...) -> shape (T, B, ...)
                    neglogp_list = [
                        _batch_to_TB(neglogp)
                        for neglogp in nest.flatten(outer_fed_head_neglogp)
                    ]
                    oldneglogp_list = [
                        _batch_to_TB(oldneglogp)
                        for oldneglogp in nest.flatten(inputs.neglogp)
                    ]
                    mask_list = [
                        _batch_to_TB(mw) for mw in nest.flatten(structured_mw)
                    ]
                    # make the len=n_v lists for value-head stuff
                    # for tensor entry, shape (batch_size, ...) -> shape (T, B, ...)
                    # as aforementioned
                    vpred_list = [
                        _batch_to_TB(v)
                        for v in tf.split(value_head, nc.n_v, axis=1)
                    ]
                    reward_list = [
                        _batch_to_TB(r)
                        for r in tf.split(inputs.r, nc.n_v, axis=1)
                    ]
                    discounts = _batch_to_TB(inputs.discount)
                    # upgo_loss only use the win_loss, i.e, v[0]
                    upgo_loss = tp_losses.upgo_loss(
                        tf.stack(neglogp_list, axis=-1),
                        tf.stack(oldneglogp_list, axis=-1),
                        tf.stack(mask_list, axis=-1), vpred_list[0],
                        reward_list[0], discounts)
                    loss_endpoints['upgo_loss'] = upgo_loss

                    if nc.use_loss_type == 'rl_ppo2':
                        # PPO2 loss
                        # reward_weights size should be consistent with n_v
                        reward_weights = tf.squeeze(
                            tf.convert_to_tensor(nc.reward_weights,
                                                 tf.float32))
                        assert reward_weights.shape.as_list(
                        )[0] == len(reward_list), (
                            'For ppo2 loss, reward_weight size must be the same with number of'
                            ' value head: each reward_weight element must correspond to one '
                            'value-head exactly.')

                        # lambda for td-lambda or lambda-return
                        assert nc.lam is not None, (
                            'building rl_ppo2, but lam for '
                            'lambda-return is None.')
                        lam = tf.convert_to_tensor(nc.lam, tf.float32)

                        # for each value-head, compute the corresponding policy gradient loss
                        # and the value loss
                        pg_loss, value_loss = [], []
                        for vpred, reward in zip(vpred_list, reward_list):
                            # compute the lambda-Return `R` in shape (T - 1, B)
                            # [:-1] means discarding the last one,
                            # [1:] means an off-one alignment.
                            # back_prop=False means R = tf.stop_gradient(R)
                            with tf.device("/cpu:0"):
                                R = multistep_forward_view(reward[:-1],
                                                           discounts[:-1],
                                                           vpred[1:],
                                                           lambda_=lam,
                                                           back_prop=False)
                            # compute the ppo2 loss using this value-head for each of the
                            # n_action_heads action-head; then reduce them
                            # [:-1] means discarding the last one and using only T - 1 time
                            # steps
                            _ploss = [
                                tp_losses.ppo2_loss(
                                    neglogp[:-1],
                                    oldneglogp[:-1],
                                    tf.stop_gradient(vpred)[:-1],
                                    R,  # has been stop_gradient above
                                    mask[:-1],
                                    adv_normalize=nc.adv_normalize,
                                    clip_range=nc.clip_range,
                                    sync_statistics=nc.sync_statistics)
                                for neglogp, oldneglogp, mask in zip(
                                    neglogp_list, oldneglogp_list, mask_list)
                            ]
                            pg_loss.append(tf.reduce_sum(_ploss))
                            # compute the value loss for this value-head
                            value_loss.append(
                                tf.reduce_mean(0.5 *
                                               tf.square(R - vpred[:-1])))
                        # element-wise times reward_weight and the pg_loss for that value-head
                        pg_loss = tf.stack(
                            pg_loss) * reward_weights  # shape (n_v,)
                        # make the final pg_loss, value_loss in desired format
                        pg_loss = tf.reduce_sum(pg_loss)
                        value_loss = tf.stack(value_loss)
                    else:
                        # vtrace loss
                        # lambda for td-lambda or lambda-return
                        assert nc.lam is not None, (
                            'building rl_vtrace, but lam for '
                            'td-lambda is None.')
                        lam = tf.convert_to_tensor(nc.lam, tf.float32)
                        value_loss = []
                        for values, rewards in zip(vpred_list, reward_list):
                            value_loss.append(
                                tp_losses.td_lambda(values,
                                                    rewards,
                                                    discounts,
                                                    lam=lam))
                        shaped_values = tf.matmul(value_head,
                                                  nc.reward_weights,
                                                  transpose_b=True)
                        shaped_rewards = tf.matmul(inputs.r,
                                                   nc.reward_weights,
                                                   transpose_b=True)
                        values = tf.transpose(
                            tf.reshape(shaped_values,
                                       shape=(nc.nrollout, nc.rollout_len)))
                        rewards = tf.transpose(
                            tf.reshape(shaped_rewards,
                                       shape=(nc.nrollout, nc.rollout_len)))
                        pg_loss = tf.reduce_sum([
                            tp_losses.vtrace_loss(neglogp, oldneglogp, mask,
                                                  values, rewards, discounts,
                                                  1.0, 1.0)
                            for oldneglogp, neglogp, mask in zip(
                                oldneglogp_list, neglogp_list, mask_list)
                        ])
                        value_loss = tf.stack(value_loss)

                # TODO: maybe more rl endpoints
                # policy gradient loss must be scalar
                loss_endpoints['pg_loss'] = pg_loss
                #  value loss can be scalar or vector
                if len(value_loss.shape) == 0:
                    loss_endpoints['value_loss'] = value_loss
                else:
                    for i in range(value_loss.shape[0]):
                        loss_endpoints['value_loss_' + str(i)] = value_loss[i]
                for k, v in entropy_loss.items():
                    loss_endpoints['ent_' + k] = v
                if nc.distillation:
                    for k, v in distill_loss.items():
                        loss_endpoints['distill_' + k] = v
                    loss_endpoints['distill_ab_bf4mins'] = ab_distill_loss
        else:
            print('use_loss_type: {}. Nothing done.'.format(nc.use_loss_type))
            pass

        return MNetV6Losses(total_reg_loss=total_reg_loss,
                            total_il_loss=total_il_loss,
                            pg_loss=pg_loss,
                            value_loss=value_loss,
                            entropy_loss=entropy_loss,
                            distill_loss=distill_loss,
                            loss_endpoints=loss_endpoints)
예제 #18
0
def conv_lstm(inputs: ConvLstmInputs,
              nc: ConvLstmConfig,
              scope=None) -> ConvLstmOutputs:
    """Create the whole net for conv-lstm.

    For each player the same conv body is applied (variables are shared across
    players through ``reuse=tf.AUTO_REUSE``), one feature vector per batch
    element is picked out of the conv feature map with ``tf.gather_nd``,
    optionally passed through an LSTM embedding block, and fed to a categorical
    action head. A value head and the 'rl' losses are built on request.

    Args:
      inputs: ConvLstmInputs. This function reads ``X`` (per-player inputs),
        ``S`` (hidden state, split along axis 1 into ``nc.n_player`` parts),
        ``M`` (mask passed to the LSTM block) and, when the 'rl' loss is built,
        ``A``, ``neglogp``, ``R`` and ``V``.
        Each per-player entry of ``X`` is indexed as ``[0]`` (conv input),
        ``[1]`` (position used for the gather) and, when present, ``[2]``
        (mask applied to the action logits).
      nc: ConvLstmConfig. Fields read here: ``n_player``, ``ac_space``,
        ``spa_ch_dim``, ``batch_size``, ``use_lstm``, ``use_value_head``,
        ``n_v``, ``use_loss_type`` and ``sync_statistics``.
      scope: optional variable scope; the default name is 'pommerman'.

    Returns:
      ConvLstmOutputs. ``self_fed_heads`` and ``outer_fed_heads`` are the same
      heads object, ``S`` is the concatenated new hidden state (an empty list
      when ``nc.use_lstm`` is falsy), ``loss`` is a ConvLstmLosses or None when
      ``nc.use_loss_type != 'rl'``, and ``value_head`` is None when
      ``nc.use_value_head`` is falsy.
    """
    with tf.variable_scope(scope, default_name='pommerman') as sc:
        # NOTE: use name_scope, in case multiple parameter-sharing nets are built
        # NOTE(review): both names below are currently unused; they look like
        # placeholders for the endpoints TODO at the end of this function.
        net_name_scope = tf.get_default_graph().get_name_scope()
        endpoints_collections = net_name_scope + '_endpoints'
        X = inputs.X
        if nc.n_player == 1:
            # wrap the single-player case so the loop below is uniform
            X = (X, )
            ac_spaces = (nc.ac_space, )
        else:
            ac_spaces = tuple(nc.ac_space.spaces)
        S = tf.split(inputs.S, nc.n_player, axis=1)
        # make body
        y = []
        hs_new = []
        heads = []
        for player_in, s, ac_space in zip(X, S, ac_spaces):
            with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
                x = tfc_layers.conv2d(player_in[0],
                                      nc.spa_ch_dim, [3, 3],
                                      scope='conv0')
                x = tfc_layers.conv2d(x, nc.spa_ch_dim, [5, 5], scope='conv1')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [3, 3],
                                      scope='conv2')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [5, 5],
                                      scope='conv3')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 4, [3, 3],
                                      scope='conv4')
                # prepend the batch index to each position so gather_nd picks
                # one feature vector per batch element
                # (presumably pos holds the spatial coordinates -- TODO confirm)
                pos = tf.to_int32(player_in[1])
                ind = tf.concat(
                    [tf.expand_dims(tf.range(nc.batch_size), 1), pos], axis=1)
                x = tf.gather_nd(x, ind)
                if nc.use_lstm:
                    with tf.variable_scope('lstm_embed'):
                        x, hs = _lstm_embed_block(inputs_x=x,
                                                  inputs_hs=s,
                                                  inputs_mask=inputs.M,
                                                  nc=nc)
                        hs_new.append(hs)
                y.append(x)

            # make action head
            with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
                head_logits = tfc_layers.fully_connected(x,
                                                         ac_space.n,
                                                         activation_fn=None,
                                                         normalizer_fn=None,
                                                         scope='logits')
                # The mask is the third element. Elements [0] and [1] are
                # always present (both are read above), so the guard has to be
                # "> 2"; with "> 1" a two-element input raised IndexError.
                if len(player_in) > 2:
                    head_logits = tp_ops.mask_logits(head_logits,
                                                     player_in[2])
                head = tp_layers.to_action_head(head_logits, CategoricalPdType)
                heads.append(head)

        if nc.use_lstm:
            hs_new = tf.concat(hs_new, axis=1)
        y = tf.concat(y, axis=1)
        heads = tp_utils.pack_sequence_as_structure_like_gym_space(
            nc.ac_space, heads)
        if nc.n_player == 1:
            heads = heads[0]
        # make value head (fed with the concatenated features of all players)
        vf = None
        if nc.use_value_head:
            with tf.variable_scope('vf'):
                vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
                vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
                vf = tfc_layers.fully_connected(vf,
                                                nc.n_v,
                                                activation_fn=None,
                                                normalizer_fn=None)
        # make loss
        loss = None
        if nc.use_loss_type == 'rl':
            # regularization loss
            total_reg_loss = tf.losses.get_regularization_losses(scope=sc.name)
            with tf.variable_scope('losses'):
                # ppo loss
                neglogp = nest.map_structure_up_to(
                    ac_spaces, lambda head, ac: head.pd.neglogp(ac), heads,
                    inputs.A)
                ppo_loss, value_loss = tp_losses.ppo_loss(
                    neglogp=neglogp,
                    oldneglogp=inputs.neglogp,
                    vpred=vf,
                    R=inputs.R,
                    V=inputs.V,
                    masks=None,
                    reward_weights=None,
                    adv_normalize=True,
                    sync_statistics=nc.sync_statistics)
                # entropy loss
                entropy_loss = nest.map_structure_up_to(
                    ac_spaces, lambda head: tf.reduce_mean(head.ent), heads)
                loss_endpoints = {}
                loss = ConvLstmLosses(total_reg_loss=total_reg_loss,
                                      pg_loss=ppo_loss,
                                      value_loss=value_loss,
                                      entropy_loss=entropy_loss,
                                      loss_endpoints=loss_endpoints)
        # collect vars, endpoints, etc.
        trainable_vars = _make_vars(sc)
        endpoints = OrderedDict()  # TODO
    return ConvLstmOutputs(self_fed_heads=heads,
                           outer_fed_heads=heads,
                           S=hs_new,
                           loss=loss,
                           vars=trainable_vars,
                           endpoints=endpoints,
                           value_head=vf)