Example #1
def visualize_nn_output(raw, player=0):
    # Print one entropy bar (relative to the uniform-policy maximum) and the
    # value estimate for each (policy, value, piece) triple in raw.
    length = 30
    blank = ' ' * length
    pref = blank if player else ''  # shift player 1's printout sideways so the two players' bars are distinguishable
    for prob_rtp, piece_value, piece in zip(*raw):
        pi = prob_rtp[:, :, piece]
        unif = np.ones_like(pi) / (pi.shape[0] * pi.shape[1])
        entropy = utils.entropy(pi)
        max_entropy = utils.entropy(unif)
        entr_vis = utils.progress_bar(entropy, max_entropy, length=length)
        print(pref + entr_vis, piece_value[0, 0, piece].round(decimals=3))
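Examples 1–5 lean on a small utils module that is not shown on this page. As a point of reference, a minimal stand-in for utils.entropy — assuming it returns the Shannon entropy (in nats) of a probability vector, which is consistent with how it is called in these functions — might look like the sketch below; the repository's actual implementation may differ.

import numpy as np

def entropy(p):
    """Shannon entropy (in nats) of a probability vector/array."""
    p = np.asarray(p, dtype=np.float64).ravel()
    return float(-np.sum(p * np.log(p + 1e-12)))  # small offset guards against log(0)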
Example #2
def action_distribution(A):
    # A is expected to be a 2D grid of action probabilities (rotations x translations)
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    p = A.ravel()
    # sample a flat index according to p, then map it back to a (rotation, translation) pair
    a_idx = np.random.choice(np.arange(A.size), p=p)
    (r, t) = np.unravel_index(a_idx, A.shape)
    entropy = utils.entropy(p + 10**-6)  # small offset keeps the entropy finite when p contains zeros
    return (r, t), entropy
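A quick usage sketch with hypothetical numbers: A must already be a normalized probability grid, since it is handed to np.random.choice directly as p, and utils.entropy / action_dim_exception are assumed to be importable from the surrounding project.

A = np.full((4, 10), 1 / 40)          # uniform policy over a 4x10 action grid
(r, t), H = action_distribution(A)    # r: rotation index, t: translation index
print(r, t, H)                        # H is close to log(40) ~ 3.69 nats for a uniform policy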
Example #3
def action_boltzman(A, theta):
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    x = A.ravel()
    # Boltzmann exploration: temperature-scaled softmax over the raw scores
    p = softmax(theta * x)
    a_idx = np.random.choice(np.arange(A.size), p=p)
    (r, t) = np.unravel_index(a_idx, A.shape)
    entropy = utils.entropy(p)
    return (r, t), entropy
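The softmax used above is not defined on this page; a minimal, numerically stable stand-in that operates on a flat score vector (an assumption about its intended behavior) could be:

import numpy as np

def softmax(z):
    z = np.asarray(z, dtype=np.float64)
    z = z - z.max()            # shift for numerical stability; does not change the result
    e_z = np.exp(z)
    return e_z / e_z.sum()

With this rule, a large theta sharpens the distribution toward the highest-scoring action, while theta near zero approaches uniform random sampling.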
Example #4
def action_pareto(A, theta):
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    x = A.ravel()
    # utils.pareto turns the raw scores into a sampling distribution, with theta acting as a temperature
    p = utils.pareto(x, temperature=theta)
    a_idx = np.random.choice(np.arange(A.size), p=p)
    (r, t) = np.unravel_index(a_idx, A.shape)
    entropy = utils.entropy(p)
    return (r, t), entropy
Example #5
def action_epsilongreedy(A, epsilon):
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    if np.random.rand() < epsilon:
        r = np.random.choice(np.arange(A.shape[0]))
        t = np.random.choice(np.arange(A.shape[1]))
    else:
        (r, t), _ = action_argmax(A)
    e = min(1, epsilon)
    _entropy = e * np.full(A.size, 1 / A.size)
    _entropy[0] += (1 - e)
    entropy = utils.entropy(_entropy)
    return (r, t), entropy
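The entropy this rule reports is not estimated from samples; it is the exact entropy of the distribution epsilon-greedy induces: probability epsilon spread uniformly over all A.size actions, plus the remaining (1 - epsilon) mass on the greedy action. A small check of that construction with hypothetical numbers:

import numpy as np

eps, n = 0.1, 40
d = eps * np.full(n, 1 / n)    # uniform exploration part
d[0] += 1 - eps                # greedy part (which cell receives it does not change the entropy)
print(d.sum())                 # 1.0 -- a valid distribution
print(-np.sum(d * np.log(d)))  # ~0.68 nats, far below the uniform maximum log(40) ~ 3.69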
Example #6
    def create_training_ops(
            self,
            policy,
            values,
            target_values,
            advantages,
            actions_training,
            pieces_training,
            old_probs,
            params,
        ):
        clip_param = params["clipping_parameter"]
        c1, c2, c3 = params["value_loss"], params["policy_loss"], params["entropy_loss"]
        e = 10**-6

        #current pi(a|s)
        r_mask = tf.reshape(tf.one_hot(actions_training[:,0], self.n_rotations),    (-1, self.n_rotations,    1,  1), name='r_mask')
        t_mask = tf.reshape(tf.one_hot(actions_training[:,1], self.n_translations), (-1,  1, self.n_translations, 1), name='t_mask')
        p_mask = tf.reshape(tf.one_hot(pieces_training[:,:],  self.n_pieces),       (-1,  1,  1, self.n_pieces     ), name='p_mask')

        rtp_mask    = r_mask * t_mask * p_mask
        probability = tf.expand_dims(tf.reduce_sum(policy * rtp_mask, axis=[1,2,3]),1)
        values      = tf.reduce_sum(values * p_mask, axis=[2,3])

        #probability ratio
        r = tf.maximum(probability, e) / tf.maximum(old_probs, e)
        clipped_r = tf.clip_by_value( r, 1-clip_param, 1+clip_param )
        r_saturation = tf.reduce_mean(tf.cast(tf.not_equal(r, clipped_r),tf.float32))

        if "compress_advantages" in self.settings:
            adv_compressor = compressor(**self.settings["compress_advantages"])
            advantages = adv_compressor(advantages)
        policy_loss = tf.minimum( r * advantages, clipped_r * advantages )

        #entropy
        entropy_bonus = action_entropy = tf.reduce_sum(N.action_entropy(policy + e) * p_mask, axis=3)
        n_actions = self.n_rotations * self.n_translations
        max_entropy = utils.entropy(np.ones(n_actions)/n_actions)

        if "entropy_floor_loss" in params:
            eps = params["ppo_epsilon"]
            entropy_floor = -eps * tf.math.log(eps / (n_actions - 1)) - (1 - eps) * tf.math.log(1 - eps)
            extra_entropy = -tf.nn.relu(entropy_floor - action_entropy)
            entropy_bonus += params["entropy_floor_loss"] * extra_entropy

        if "rescaled_entropy" in params:
            entropy_bonus += params["rescaled_entropy"] * (max_entropy - entropy_bonus)

        #tally up
        self.value_loss_tf   =  c1 * tf.losses.mean_squared_error(values, target_values) #reduce loss
        self.policy_loss_tf  = -c2 * tf.reduce_mean(policy_loss) #increase expected advantages
        self.entropy_loss_tf = -c3 * tf.reduce_mean(entropy_bonus) #increase entropy
        self.regularizer_tf  = self.settings["nn_regularizer"] * tf.add_n([tf.nn.l2_loss(v) for v in self.main_net.variables])
        if "compress_value_loss" in self.settings:
            val_compressor = compressor(**self.settings["compress_value_loss"])
            self.value_loss_tf = val_compressor(self.value_loss_tf)
        self.loss_tf = self.value_loss_tf + self.policy_loss_tf + self.entropy_loss_tf + self.regularizer_tf
        training_ops = self.settings["optimizer"](learning_rate=params['lr']).minimize(self.loss_tf)

        #Stats: we like stats.
        self.output_as_stats( action_entropy, name='entropy/entropy'                      )
        self.output_as_stats( entropy_bonus,  name='entropy/entropy_bonus', only_mean=True)
        self.output_as_stats( values,         name='misc/values'                          )
        self.output_as_stats( target_values,  name='misc/target_values'                   )
        self.output_as_stats( r_saturation,   name='misc/clip_saturation',  only_mean=True)

        if self.settings["compress_advantages"]:
            self.output_as_stats( adv_compressor.x_mean,       name='compressors/advantage/compressor',            only_mean=True)
            self.output_as_stats( adv_compressor.x_max,        name='compressors/advantage/compressor_max',        only_mean=True)
            self.output_as_stats( adv_compressor.x_saturation, name='compressors/advantage/compressor_saturation', only_mean=True)

        if self.settings["compress_value_loss"]:
            self.output_as_stats( val_compressor.x_mean,       name='compressors/valueloss/compressor',            only_mean=True)
            self.output_as_stats( val_compressor.x_max,        name='compressors/valueloss/compressor_max',        only_mean=True)
            self.output_as_stats( val_compressor.x_saturation, name='compressors/valueloss/compressor_saturation', only_mean=True)

        self.output_as_stats( self.loss_tf,         name='losses/total_loss',       only_mean=True)
        self.output_as_stats( self.value_loss_tf,   name='losses/value_loss',       only_mean=True)
        self.output_as_stats(-self.policy_loss_tf,  name='losses/policy_loss',      only_mean=True)
        self.output_as_stats(-self.entropy_loss_tf, name='losses/entropy_loss',     only_mean=True)
        self.output_as_stats( self.regularizer_tf,  name='losses/regularizer_loss', only_mean=True)

        if self.settings.get('record-parameters-to-tb', False):
            for param_name in params:
                self.output_as_stats( params[param_name], name='parameters/'+param_name, only_mean=True)

        # NOTE: assumes both compressors were created above, i.e. the compress_* settings are enabled
        return [training_ops, adv_compressor.update_op, val_compressor.update_op]
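The core of create_training_ops is a standard PPO clipped-surrogate setup: one-hot masks over the rotation, translation and piece dimensions pick out pi(a|s) for the actions actually taken, the ratio against the old probabilities is clipped to [1 - clip_param, 1 + clip_param], and the pessimistic minimum of the clipped and unclipped surrogates is what gets maximized (before the value, entropy and regularizer terms are added). A minimal NumPy sketch of just that objective, with toy numbers and no TensorFlow, for orientation only:

import numpy as np

def ppo_clipped_surrogate(new_probs, old_probs, advantages, clip=0.2, eps=1e-6):
    """Per-sample clipped PPO objective (to be maximized)."""
    r = np.maximum(new_probs, eps) / np.maximum(old_probs, eps)  # probability ratio
    clipped_r = np.clip(r, 1 - clip, 1 + clip)
    return np.minimum(r * advantages, clipped_r * advantages)    # pessimistic bound

new_p = np.array([0.30, 0.40, 0.10])   # pi(a|s) under the current policy
old_p = np.array([0.25, 0.20, 0.20])   # pi(a|s) when the data was collected
adv   = np.array([1.0, 1.0, -1.0])
print(ppo_clipped_surrogate(new_p, old_p, adv))  # [ 1.2  1.2 -0.8]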