Пример #1
0
def cont_nn_inputs_placeholder(nc: ContNNConfig):
  """Build the `ContNNInputs` placeholders used by the simple MLP nets.

  Args:
    nc: net config. Reads `ob_space`, `ac_space`, `batch_size` and `test`.

  Returns:
    A `ContNNInputs` holding:
      X: observation placeholders derived from `nc.ob_space`.
      A: action placeholders derived from `nc.ac_space`, or a structure of
        `None` leaves when `nc.test` is truthy.
      neglogp: one float32 placeholder of shape (batch_size,) per leaf of
        `nc.ac_space`.
      R, V: float32 placeholders of shape (batch_size, 1).
  """
  batch = nc.batch_size
  n_value_heads = 1  # this net always uses a single value head

  def _neglogp_for_head(_space):
    # one scalar per sample for each action head
    return tf.placeholder(shape=(batch, ), dtype=tf.float32, name='neglogp')

  def _value_shaped(name):
    return tf.placeholder(tf.float32, (batch, n_value_heads), name)

  obs_ph = tp_utils.placeholders_from_gym_space(
    nc.ob_space, batch_size=batch, name='ob_ph')

  if not nc.test:
    act_ph = tp_utils.placeholders_from_gym_space(
      nc.ac_space, batch_size=batch, name='ac_ph')
  else:
    # no ground-truth actions are available at test time
    act_ph = tp_utils.map_gym_space_to_structure(
      lambda _space: None, nc.ac_space)

  old_neglogp = tp_utils.map_gym_space_to_structure(
    func=_neglogp_for_head, gym_sp=nc.ac_space)
  returns_ph = _value_shaped('R')
  values_ph = _value_shaped('V')

  return ContNNInputs(X=obs_ph, A=act_ph, neglogp=old_neglogp,
                      R=returns_ph, V=values_ph)
Пример #2
0
 def __init__(self,
              ob_space,
              ac_space,
              use_self_fed_heads=True,
              use_lstm=False,
              hs_len=None):
     """Declare the fields, specs and templates of an inference sample.

     Fields are added in this order:
       'X': always; spec/template derived from `ob_space`.
       'A': only when `use_self_fed_heads` is falsy; derived from `ac_space`.
       'S', 'M': only when `use_lstm` is truthy; 'S' has spec
         ([hs_len], float32) and 'M' has spec ([], bool). Both templates are
         None.

     Args:
       ob_space: gym observation space.
       ac_space: gym action space.
       use_self_fed_heads: when falsy, the action field 'A' is included.
       use_lstm: when truthy, the 'S' and 'M' fields are included.
       hs_len: length of the 'S' vector; must be integer-valued when
         `use_lstm` is truthy.
     """
     shape_dtype = lambda x: (x.shape, x.dtype)
     _fields = ['X']
     specs = [map_gym_space_to_structure(shape_dtype, ob_space)]
     templates = [template_structure_from_gym_space(ob_space)]
     if not use_self_fed_heads:
         _fields.append('A')
         specs.append(map_gym_space_to_structure(shape_dtype, ac_space))
         templates.append(template_structure_from_gym_space(ac_space))
     if use_lstm:
         assert int(hs_len) == hs_len
         _fields.extend(['S', 'M'])
         specs.extend([
             ([hs_len], np.float32),
             # `np.bool` was only an alias of the builtin `bool`; it was
             # deprecated in NumPy 1.20 and removed in 1.24, so use the
             # builtin directly.
             ([], bool),
         ])
         templates.extend([None, None])
     super(InfData, self).__init__(_fields, specs, templates)
Пример #3
0
def conv_lstm_inputs_placeholder(nc: ConvLstmConfig):
    """Build the `ConvLstmInputs` placeholders (pommerman conv-LSTM net).

    Args:
        nc: net config. Reads `ob_space`, `ac_space`, `batch_size`, `test`,
            `n_v` and `hs_len`.

    Returns:
        A `ConvLstmInputs` holding:
          X: observation placeholders derived from `nc.ob_space`.
          A: action placeholders derived from `nc.ac_space`, or a structure
            of `None` leaves when `nc.test` is truthy.
          neglogp: one float32 (batch_size,) placeholder per leaf of
            `nc.ac_space`.
          R, V: float32 placeholders of shape (batch_size, n_v).
          S: float32 placeholder of shape (batch_size, hs_len).
          M: float32 placeholder of shape (batch_size,).
    """
    batch = nc.batch_size
    f32 = tf.float32

    def _neglogp_for_head(_space):
        return tf.placeholder(shape=(batch, ), dtype=f32, name='neglogp')

    obs_ph = tp_utils.placeholders_from_gym_space(
        nc.ob_space, batch_size=batch, name='ob_ph')

    if not nc.test:
        act_ph = tp_utils.placeholders_from_gym_space(
            nc.ac_space, batch_size=batch, name='ac_ph')
    else:
        # no ground-truth actions are available at test time
        act_ph = tp_utils.map_gym_space_to_structure(
            lambda _space: None, nc.ac_space)

    old_neglogp = tp_utils.map_gym_space_to_structure(
        func=_neglogp_for_head, gym_sp=nc.ac_space)

    per_value_head = (batch, nc.n_v)
    returns_ph = tf.placeholder(f32, per_value_head, 'R')
    values_ph = tf.placeholder(f32, per_value_head, 'V')
    hidden_state_ph = tf.placeholder(f32, (batch, nc.hs_len), 'hs')
    hidden_mask_ph = tf.placeholder(f32, (batch, ), 'hsm')

    return ConvLstmInputs(
        X=obs_ph,
        A=act_ph,
        neglogp=old_neglogp,
        R=returns_ph,
        V=values_ph,
        S=hidden_state_ph,
        M=hidden_mask_ph,
    )
Пример #4
0
 def __init__(self,
              ob_space,
              ac_space,
              n_v,
              use_lstm=False,
              hs_len=None,
              distillation=False,
              use_oppo_data=False,
              random_policy=True):
     """Declare the fields, specs and templates of a policy-gradient sample.

     Fields are added in this order:
       'X', 'A': always; derived from `ob_space` and `ac_space`.
       'neglogp': when `random_policy`; one ([], float32) spec per leaf of
         `ac_space`.
       'S', 'M': when `use_lstm`; specs ([hs_len], float32) and ([], bool),
         templates None.
       'flatparam': when `distillation`; per-leaf spec
         (make_pdtype(leaf).param_shape(), float32).
       'OPPO_X': when `use_oppo_data`; derived from `ob_space`.
       'OPPO_S': when both `use_oppo_data` and `use_lstm`; spec
         ([hs_len], float32), template None.

     The resulting lists are also stored on `self.specs` and
     `self.templates` before the base-class constructor is called.

     Args:
       ob_space: gym observation space.
       ac_space: gym action space.
       n_v: accepted for interface compatibility; not used in this
         constructor.
       use_lstm: include the 'S'/'M' (and possibly 'OPPO_S') fields.
       hs_len: length of the 'S' vector; must be integer-valued when
         `use_lstm` is truthy.
       distillation: include the 'flatparam' field.
       use_oppo_data: include the 'OPPO_X' (and possibly 'OPPO_S') fields.
       random_policy: include the 'neglogp' field.
     """
     _fields = ['X', 'A']
     shape_dtype = lambda x: (x.shape, x.dtype)
     specs = [
         map_gym_space_to_structure(shape_dtype, ob_space),
         map_gym_space_to_structure(shape_dtype, ac_space),
     ]
     templates = [
         template_structure_from_gym_space(ob_space),
         template_structure_from_gym_space(ac_space),
     ]
     if random_policy:
         _fields.append('neglogp')
         specs.append(
             map_gym_space_to_structure(lambda x: ([], np.float32),
                                        ac_space))
         templates.append(template_structure_from_gym_space(ac_space))
     if use_lstm:
         assert int(hs_len) == hs_len
         _fields.extend(['S', 'M'])
         specs.extend([
             ([hs_len], np.float32),
             # `np.bool` was only an alias of the builtin `bool`; it was
             # deprecated in NumPy 1.20 and removed in 1.24, so use the
             # builtin directly.
             ([], bool),
         ])
         templates.extend([None, None])
     if distillation:
         _fields.append('flatparam')
         logit_shape_dtype = lambda x: (make_pdtype(x).param_shape(),
                                        np.float32)
         specs.append(map_gym_space_to_structure(logit_shape_dtype, ac_space))
         templates.append(template_structure_from_gym_space(ac_space))
     if use_oppo_data:
         _fields.append('OPPO_X')
         specs.append(map_gym_space_to_structure(shape_dtype, ob_space))
         templates.append(template_structure_from_gym_space(ob_space))
         if use_lstm:
             _fields.append('OPPO_S')  # oppo's mask is the same as self
             specs.append(([hs_len], np.float32))
             templates.append(None)
     self.specs = specs
     self.templates = templates
     super(PGData, self).__init__(_fields, specs, templates)
Пример #5
0
def ddpg_inputs_placeholder(nc: DDPGConfig):
    """Build the `DDPGInputs` placeholders for gym_ddpg.

    Args:
        nc: net config. Reads `ob_space`, `ac_space`, `batch_size`, `test`,
            `n_v` and `hs_len`.

    Returns:
        A `DDPGInputs` holding:
          X: observation placeholders derived from `nc.ob_space`.
          A: action placeholders derived from `nc.ac_space`, or a structure
            of `None` leaves when `nc.test` is truthy.
          S: float32 placeholder of shape (batch_size, hs_len).
          M: float32 placeholder of shape (batch_size,).
          r: float32 placeholder of shape (batch_size, n_v).
          discount: float32 placeholder of shape (batch_size,).
    """
    batch = nc.batch_size
    per_sample = (batch, )

    obs_ph = tp_utils.placeholders_from_gym_space(
        nc.ob_space, batch_size=batch, name='ob_ph')

    if not nc.test:
        act_ph = tp_utils.placeholders_from_gym_space(
            nc.ac_space, batch_size=batch, name='ac_ph')
    else:
        # no ground-truth actions are available at test time
        act_ph = tp_utils.map_gym_space_to_structure(
            lambda _space: None, nc.ac_space)

    # created in the same order as before: r, discount, hs, hsm
    reward_ph = tf.placeholder(tf.float32, (batch, nc.n_v), 'r')
    discount_ph = tf.placeholder(tf.float32, per_sample, 'discount')
    hidden_state_ph = tf.placeholder(tf.float32, (batch, nc.hs_len), 'hs')
    hidden_mask_ph = tf.placeholder(tf.float32, per_sample, 'hsm')

    return DDPGInputs(X=obs_ph,
                      A=act_ph,
                      S=hidden_state_ph,
                      M=hidden_mask_ph,
                      r=reward_ph,
                      discount=discount_ph)
Пример #6
0
 def __init__(self, ob_space, ac_space, n_v, use_lstm=False, hs_len=None,
              distillation=False, version='v1', use_oppo_data=False):
   """Declare the fields, specs and templates of a policy-gradient sample.

   Fields are added in this order:
     'X', 'A', 'neglogp': always.
     'S', 'M': when `use_lstm`; specs ([hs_len], float32) and ([], bool),
       templates None.
     'logits': when `distillation`.
     'OPPO_X': when `use_oppo_data`; derived from `ob_space`.
     'OPPO_S': when both `use_oppo_data` and `use_lstm`.

   `version` selects the layout of 'neglogp' and 'logits':
     'v1': each is a single entry with template None; the 'neglogp' spec is
       ([len(ac_space.spaces)], float32).
     'v2': each is structured like `ac_space`, one spec per leaf.

   The resulting lists are also stored on `self.specs` and `self.templates`
   before the base-class constructor is called.

   Args:
     ob_space: gym observation space.
     ac_space: gym action space; 'v1' reads `ac_space.spaces`.
     n_v: accepted for interface compatibility; not used in this
       constructor.
     use_lstm: include the 'S'/'M' (and possibly 'OPPO_S') fields.
     hs_len: length of the 'S' vector; must be integer-valued when
       `use_lstm` is truthy.
     distillation: include the 'logits' field.
     version: 'v1' or 'v2'.
     use_oppo_data: include the 'OPPO_X' (and possibly 'OPPO_S') fields.

   Raises:
     KeyError: if `version` is neither 'v1' nor 'v2'.
   """
   _fields = ['X', 'A', 'neglogp']
   shape_dtype = lambda x: (x.shape, x.dtype)
   logit_shape_dtype = lambda x: (make_pdtype(x).param_shape(), np.float32)
   if version == 'v1':  # neglogp/logits is one long vector
     neglogp_shape_dtype = ([len(ac_space.spaces)], np.float32)
     neglogp_templates = None
     # NOTE(review): `logit_shape_dtype` already returns (shape, dtype), so
     # this is a nested ((shape, dtype), dtype) tuple. Kept unchanged because
     # how the base class consumes it is not visible here - confirm whether
     # the extra wrapping is intended.
     logits_shape_dtype = (logit_shape_dtype(ac_space), np.float32)
     logits_templates = None
   elif version == 'v2':  # neglogp/logits is structure same as ac_space
     neglogp_shape_dtype = map_gym_space_to_structure(
       lambda x: ([], np.float32), ac_space)
     neglogp_templates = template_structure_from_gym_space(ac_space)
     logits_shape_dtype = map_gym_space_to_structure(
       logit_shape_dtype, ac_space)
     logits_templates = template_structure_from_gym_space(ac_space)
   else:
     raise KeyError('version not support!')
   specs = [map_gym_space_to_structure(shape_dtype, ob_space),
            map_gym_space_to_structure(shape_dtype, ac_space),
            neglogp_shape_dtype]
   templates = [template_structure_from_gym_space(ob_space),
                template_structure_from_gym_space(ac_space),
                neglogp_templates]
   if use_lstm:
     assert int(hs_len) == hs_len
     _fields.extend(['S', 'M'])
     # `np.bool` was only an alias of the builtin `bool`; it was deprecated
     # in NumPy 1.20 and removed in 1.24, so use the builtin directly.
     specs.extend([([hs_len], np.float32),
                   ([], bool), ])
     templates.extend([None, None, ])
   if distillation:
     _fields.append('logits')
     specs.append(logits_shape_dtype)
     templates.append(logits_templates)
   if use_oppo_data:
     _fields.append('OPPO_X')
     specs.append(map_gym_space_to_structure(shape_dtype, ob_space))
     templates.append(template_structure_from_gym_space(ob_space))
     if use_lstm:
       _fields.append('OPPO_S')  # oppo's mask is the same as self
       specs.append(([hs_len], np.float32))
       templates.append(None)
   self.specs = specs
   self.templates = templates
   super(PGData, self).__init__(_fields, specs, templates)
Пример #7
0
def mnet_v6d6_loss(inputs: MNetV6Inputs,
                   outer_fed_heads,
                   value_head,
                   consts: MNetV6Consts,
                   nc: MNetV6Config,
                   net_level_scope: str,
                   structured_mw=None,
                   scope=None):
    """Build the loss tensors selected by `nc.use_loss_type`.

    The regularization loss is always collected. What else is built depends
    on `nc.use_loss_type`:
      'il': multi-head neglogp (cross entropy) loss against `inputs.A`; the
        per-head losses become `loss_endpoints`.
      'rl' / 'rl_ppo': policy and value loss from `tp_losses.ppo_loss`.
      'rl_ppo2': for every value head, a PPO2 policy loss against a
        lambda-return and a squared-error value loss; the policy losses are
        weighted by `nc.reward_weights` and summed.
      'rl_vtrace': a `tp_losses.vtrace_loss` policy loss on reward-weighted
        values/rewards plus one `tp_losses.td_lambda` value loss per head.
      anything else: a message is printed and nothing more is built.
    All 'rl*' types also build a per-head entropy term and, when
    `nc.distillation` is set, distillation losses. 'rl_ppo2' and 'rl_vtrace'
    additionally record an UPGO loss in `loss_endpoints` only.

    Args:
      inputs: network inputs. Depending on the loss type this reads `A`,
        `X`, `neglogp`, `R`, `V`, `r`, `discount` and `logits`.
      outer_fed_heads: action heads structured like `nc.ac_space`; each head
        is accessed through `.pd` and `.ent`.
      value_head: value predictions; split into `nc.n_v` pieces along axis 1
        for 'rl_ppo2' / 'rl_vtrace'.
      consts: provides `arg_mask`, used when `structured_mw` is None.
      nc: net config.
      net_level_scope: prefix used as the pattern '<prefix>.*' when
        collecting regularization losses.
      structured_mw: optional mask weights structured like `nc.ac_space`;
        derived from `inputs.A['A_AB']` and `consts.arg_mask` when None.
      scope: optional variable scope; the default name is 'mnet_v6_losses'.

    Returns:
      An `MNetV6Losses`. Members that the selected loss type does not build
      are None, and `loss_endpoints` is then an empty dict.
    """
    # regularization loss. Only `variable`s are involved, so it is safe to
    # collect them using regular expression, e.g., 'mnet_v5.*', regardless
    # of the current name_scope (e.g., 'mnet_v5_1', 'mnet_v5_2', ...)
    total_reg_loss = tf.losses.get_regularization_loss(
        scope='{}.*'.format(net_level_scope))

    total_il_loss = None
    pg_loss = None
    value_loss = None
    entropy_loss = None
    distill_loss = None
    loss_endpoints = {}
    # action-space-shaped structure with None leaves; passed as the shallow
    # structure to every nest.map_structure_up_to call below
    example_ac_sp = tp_utils.map_gym_space_to_structure(
        lambda x: None, nc.ac_space)
    with tf.variable_scope(scope, default_name='mnet_v6_losses'):
        if nc.use_loss_type in ['il', 'rl', 'rl_ppo', 'rl_ppo2', 'rl_vtrace']:
            # head masks and structure template
            if structured_mw is None:
                mw = _action_mask_weights(inputs_ab=inputs.A['A_AB'],
                                          inputs_arg_mask=consts.arg_mask,
                                          weights_include_ab=True)
                structured_mw = tp_utils.pack_sequence_as_structure_like_gym_space(
                    nc.ac_space, mw)
            outer_fed_head_pds = nest.map_structure_up_to(
                example_ac_sp, lambda head: head.pd, outer_fed_heads)

            if nc.use_loss_type == 'il':
                # build imitation learning loss the cross entropy
                total_il_loss, head_xe_loss = tp_losses.multi_head_neglogp_loss(
                    inputs_action_pds=outer_fed_head_pds,
                    inputs_action_labels=inputs.A,
                    inputs_mask_weights=structured_mw,
                    set_loss=nc.il_multi_label_loss,
                )
                assert type(head_xe_loss) == OrderedDict
                loss_endpoints = head_xe_loss
            elif nc.use_loss_type in ['rl', 'rl_ppo', 'rl_ppo2', 'rl_vtrace']:
                # build rl losses

                # the entropy regularizer
                entropy_loss = nest.map_structure_up_to(
                    example_ac_sp,
                    lambda head, mask: tf.reduce_mean(head.ent * mask),
                    outer_fed_heads, structured_mw)

                # distillation loss, i.e., the teacher-student KL regularizer
                distill_loss = None
                ab_distill_loss = None
                if nc.distillation:
                    # NOTE(review): same mapping as the `outer_fed_head_pds`
                    # already built above; this recomputation looks redundant
                    outer_fed_head_pds = nest.map_structure_up_to(
                        example_ac_sp, lambda head: head.pd, outer_fed_heads)
                    distill_loss = tp_losses.distill_loss(
                        student_pds=outer_fed_head_pds,
                        teacher_logits=inputs.logits,
                        masks=structured_mw)
                    ab_pd = outer_fed_head_pds['A_AB']
                    teacher_logit = inputs.logits['A_AB']
                    # TODO: this is from definition of position encoding, remove it?
                    # NOTE(review): presumably this mask keeps samples from
                    # the first 4 minutes of a game - confirm against the
                    # position encoding. The second factor keeps only samples
                    # whose 'Z_BUILD_ORDER' sums to a positive value.
                    first_4mins_mask = tf.cast(
                        inputs.X['X_VEC_GAME_PROG'][:, -1] >= np.cos(
                            60 * 4 * np.power(10000, -62 / 64)), tf.float32)
                    first_4mins_mask *= tf.cast((tf.reduce_sum(
                        inputs.X['Z_BUILD_ORDER'], axis=[1, 2]) > 0),
                                                tf.float32)
                    ab_distill_loss = tp_losses.distill_loss(
                        ab_pd, teacher_logit, first_4mins_mask)

                # the main policy gradient loss
                outer_fed_head_neglogp = nest.map_structure_up_to(
                    example_ac_sp, lambda head, ac: head.pd.neglogp(ac),
                    outer_fed_heads, inputs.A)
                loss_endpoints = {}
                if nc.use_loss_type == 'rl' or nc.use_loss_type == 'rl_ppo':
                    # PPO loss
                    pg_loss, value_loss = tp_losses.ppo_loss(
                        outer_fed_head_neglogp,
                        inputs.neglogp,
                        value_head,
                        inputs.R,
                        inputs.V,
                        masks=structured_mw,
                        reward_weights=nc.reward_weights,
                        merge_pi=nc.merge_pi,
                        adv_normalize=nc.adv_normalize,
                        clip_range=nc.clip_range,
                        sync_statistics=nc.sync_statistics,
                    )
                elif nc.use_loss_type in ['rl_ppo2', 'rl_vtrace']:
                    # Note: we need convert the shape (batch_size, ...) to the shape
                    # (T, B, ...) where T=nc.rollout_len, B=nc.nrollout, batch_size=B*T
                    # When computing ppo2-loss and value-loss, only T-1 time steps are
                    # used due to the value bootstrap at the tail. When doing so, the
                    # [:-1] indexing, leading to (T - 1, B, ...) tensor slice, makes life
                    # much easier

                    # reshape to (nrollout, rollout_len), then transpose so
                    # that the rollout_len axis comes first
                    def _batch_to_TB(tsr):
                        return tf.transpose(
                            tf.reshape(tsr,
                                       shape=(nc.nrollout, nc.rollout_len)))

                    # make the len=n_action_heads lists for action-head stuff
                    # for tensor entry, shape (batch_size, ...) -> shape (T, B, ...)
                    neglogp_list = [
                        _batch_to_TB(neglogp)
                        for neglogp in nest.flatten(outer_fed_head_neglogp)
                    ]
                    oldneglogp_list = [
                        _batch_to_TB(oldneglogp)
                        for oldneglogp in nest.flatten(inputs.neglogp)
                    ]
                    mask_list = [
                        _batch_to_TB(mw) for mw in nest.flatten(structured_mw)
                    ]
                    # make the len=n_v lists for value-head stuff
                    # for tensor entry, shape (batch_size, ...) -> shape (T, B, ...)
                    # as aforementioned
                    vpred_list = [
                        _batch_to_TB(v)
                        for v in tf.split(value_head, nc.n_v, axis=1)
                    ]
                    reward_list = [
                        _batch_to_TB(r)
                        for r in tf.split(inputs.r, nc.n_v, axis=1)
                    ]
                    discounts = _batch_to_TB(inputs.discount)
                    # upgo_loss only use the win_loss, i.e, v[0]
                    # it is only recorded as an endpoint; it is not added to
                    # pg_loss or value_loss in this function
                    upgo_loss = tp_losses.upgo_loss(
                        tf.stack(neglogp_list, axis=-1),
                        tf.stack(oldneglogp_list, axis=-1),
                        tf.stack(mask_list, axis=-1), vpred_list[0],
                        reward_list[0], discounts)
                    loss_endpoints['upgo_loss'] = upgo_loss

                    if nc.use_loss_type == 'rl_ppo2':
                        # PPO2 loss
                        # reward_weights size should be consistent with n_v
                        reward_weights = tf.squeeze(
                            tf.convert_to_tensor(nc.reward_weights,
                                                 tf.float32))
                        assert reward_weights.shape.as_list(
                        )[0] == len(reward_list), (
                            'For ppo2 loss, reward_weight size must be the same with number of'
                            ' value head: each reward_weight element must correspond to one '
                            'value-head exactly.')

                        # lambda for td-lambda or lambda-return
                        assert nc.lam is not None, (
                            'building rl_ppo2, but lam for '
                            'lambda-return is None.')
                        lam = tf.convert_to_tensor(nc.lam, tf.float32)

                        # for each value-head, compute the corresponding policy gradient loss
                        # and the value loss
                        pg_loss, value_loss = [], []
                        for vpred, reward in zip(vpred_list, reward_list):
                            # compute the lambda-Return `R` in shape (T - 1, B)
                            # [:-1] means discarding the last one,
                            # [1:] means an off-one alignment.
                            # back_prop=False means R = tf.stop_gradient(R)
                            with tf.device("/cpu:0"):
                                R = multistep_forward_view(reward[:-1],
                                                           discounts[:-1],
                                                           vpred[1:],
                                                           lambda_=lam,
                                                           back_prop=False)
                            # compute the ppo2 loss using this value-head for each of the
                            # n_action_heads action-head; then reduce them
                            # [:-1] means discarding the last one and using only T - 1 time
                            # steps
                            _ploss = [
                                tp_losses.ppo2_loss(
                                    neglogp[:-1],
                                    oldneglogp[:-1],
                                    tf.stop_gradient(vpred)[:-1],
                                    R,  # has been stop_gradient above
                                    mask[:-1],
                                    adv_normalize=nc.adv_normalize,
                                    clip_range=nc.clip_range,
                                    sync_statistics=nc.sync_statistics)
                                for neglogp, oldneglogp, mask in zip(
                                    neglogp_list, oldneglogp_list, mask_list)
                            ]
                            pg_loss.append(tf.reduce_sum(_ploss))
                            # compute the value loss for this value-head
                            value_loss.append(
                                tf.reduce_mean(0.5 *
                                               tf.square(R - vpred[:-1])))
                        # element-wise times reward_weight and the pg_loss for that value-head
                        pg_loss = tf.stack(
                            pg_loss) * reward_weights  # shape (n_v,)
                        # make the final pg_loss, value_loss in desired format
                        pg_loss = tf.reduce_sum(pg_loss)
                        value_loss = tf.stack(value_loss)
                    else:
                        # vtrace loss
                        # lambda for td-lambda or lambda-return
                        assert nc.lam is not None, (
                            'building rl_vtrace, but lam for '
                            'td-lambda is None.')
                        lam = tf.convert_to_tensor(nc.lam, tf.float32)
                        value_loss = []
                        for values, rewards in zip(vpred_list, reward_list):
                            value_loss.append(
                                tp_losses.td_lambda(values,
                                                    rewards,
                                                    discounts,
                                                    lam=lam))
                        # combine the value heads / rewards with
                        # nc.reward_weights, then apply the same
                        # reshape-and-transpose as _batch_to_TB
                        shaped_values = tf.matmul(value_head,
                                                  nc.reward_weights,
                                                  transpose_b=True)
                        shaped_rewards = tf.matmul(inputs.r,
                                                   nc.reward_weights,
                                                   transpose_b=True)
                        values = tf.transpose(
                            tf.reshape(shaped_values,
                                       shape=(nc.nrollout, nc.rollout_len)))
                        rewards = tf.transpose(
                            tf.reshape(shaped_rewards,
                                       shape=(nc.nrollout, nc.rollout_len)))
                        pg_loss = tf.reduce_sum([
                            tp_losses.vtrace_loss(neglogp, oldneglogp, mask,
                                                  values, rewards, discounts,
                                                  1.0, 1.0)
                            for oldneglogp, neglogp, mask in zip(
                                oldneglogp_list, neglogp_list, mask_list)
                        ])
                        value_loss = tf.stack(value_loss)

                # TODO: maybe more rl endpoints
                # policy gradient loss must be scalar
                loss_endpoints['pg_loss'] = pg_loss
                #  value loss can be scalar or vector
                if len(value_loss.shape) == 0:
                    loss_endpoints['value_loss'] = value_loss
                else:
                    for i in range(value_loss.shape[0]):
                        loss_endpoints['value_loss_' + str(i)] = value_loss[i]
                for k, v in entropy_loss.items():
                    loss_endpoints['ent_' + k] = v
                if nc.distillation:
                    for k, v in distill_loss.items():
                        loss_endpoints['distill_' + k] = v
                    loss_endpoints['distill_ab_bf4mins'] = ab_distill_loss
        else:
            print('use_loss_type: {}. Nothing done.'.format(nc.use_loss_type))
            pass

        return MNetV6Losses(total_reg_loss=total_reg_loss,
                            total_il_loss=total_il_loss,
                            pg_loss=pg_loss,
                            value_loss=value_loss,
                            entropy_loss=entropy_loss,
                            distill_loss=distill_loss,
                            loss_endpoints=loss_endpoints)