def visualize_nn_output(raw, player=0):
    # Print one entropy bar per piece: how spread out the policy is over
    # (rotation, translation), alongside the value estimate for that piece.
    length = 30
    blank = ' ' * length
    pref = blank if player else ''
    for prob_rtp, piece_value, piece in zip(*raw):
        pi = prob_rtp[:, :, piece]
        unif = np.ones_like(pi) / (pi.shape[0] * pi.shape[1])
        entropy = utils.entropy(pi)
        max_entropy = utils.entropy(unif)
        entr_vis = utils.progress_bar(entropy, max_entropy, length=length)
        print(pref + entr_vis, piece_value[0, 0, piece].round(decimals=3))
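# The helpers utils.entropy and utils.progress_bar live elsewhere in the repo;
# the sketches below are hypothetical stand-ins showing the behaviour the code
# above assumes (Shannon entropy in nats, and a fixed-width text bar). They are
# illustrative only, not the actual utils implementation.
def _entropy_sketch(p, eps=1e-12):
    # Shannon entropy (nats) of an array assumed to already sum to 1.
    import numpy as np
    p = np.asarray(p, dtype=np.float64).ravel()
    return float(-np.sum(p * np.log(p + eps)))

def _progress_bar_sketch(x, total, length=30):
    # Render x/total as a fixed-width text bar, e.g. '[#####.....]'.
    filled = int(round(length * x / total))
    return '[' + '#' * filled + '.' * (length - filled) + ']'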
def action_distribution(A):
    # Sample (rotation, translation) directly from the 2-D probability array A.
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    p = A.ravel()
    a_idx = np.random.choice(np.arange(A.size), p=p)
    (r, t) = np.unravel_index(a_idx, A.shape)
    entropy = utils.entropy(p + 10**-6)
    return (r, t), entropy
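def _demo_action_sampling():
    # Illustrative only (toy numbers, not from the repo): the flatten/sample/
    # unravel pattern used by action_distribution above.
    import numpy as np
    A = np.array([[0.10, 0.20, 0.10],
                  [0.30, 0.20, 0.10]])          # 2 rotations x 3 translations
    a_idx = np.random.choice(A.size, p=A.ravel())
    r, t = np.unravel_index(a_idx, A.shape)
    return r, t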
def action_boltzman(A, theta):
    # Boltzmann (softmax) exploration with inverse temperature theta.
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    x = A.ravel()
    p = softmax(theta * x)
    a_idx = np.random.choice(np.arange(A.size), p=p)
    (r, t) = np.unravel_index(a_idx, A.shape)
    entropy = utils.entropy(p)
    return (r, t), entropy
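def _softmax_sketch(x):
    # Hypothetical stand-in for the softmax referenced by action_boltzman
    # (the repo may use scipy.special.softmax or its own helper); numerically
    # stabilized by subtracting the max before exponentiating.
    import numpy as np
    z = np.asarray(x, dtype=np.float64)
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()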
def action_pareto(A, theta):
    # Pareto-style exploration: utils.pareto reweights the flattened scores
    # with a temperature before sampling.
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    x = A.ravel()
    p = utils.pareto(x, temperature=theta)
    a_idx = np.random.choice(np.arange(A.size), p=p)
    (r, t) = np.unravel_index(a_idx, A.shape)
    entropy = utils.entropy(p)
    return (r, t), entropy
def action_epsilongreedy(A, epsilon):
    # Epsilon-greedy: uniform random (r, t) with probability epsilon,
    # otherwise the argmax action.
    if len(A.shape) != 2:
        raise action_dim_exception(A.shape)
    if np.random.rand() < epsilon:
        r = np.random.choice(np.arange(A.shape[0]))
        t = np.random.choice(np.arange(A.shape[1]))
    else:
        (r, t), _ = action_argmax(A)
    # Entropy of the effective policy: (1 - epsilon) mass on the greedy action
    # plus epsilon spread uniformly over all actions.
    e = min(1, epsilon)
    _entropy = e * np.full(A.size, 1 / A.size)
    _entropy[0] += (1 - e)
    entropy = utils.entropy(_entropy)
    return (r, t), entropy
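def _demo_epsilongreedy_entropy(eps=0.1, n=12):
    # Illustrative only (toy numbers, not from the repo): the entropy
    # bookkeeping used by action_epsilongreedy, computed on the explicit
    # mixture distribution (1 - eps on the greedy action, eps spread uniformly).
    import numpy as np
    p = np.full(n, eps / n)
    p[0] += 1.0 - eps
    return float(-np.sum(p * np.log(p)))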
def create_training_ops(
    self,
    policy,
    values,
    target_values,
    advantages,
    actions_training,
    pieces_training,
    old_probs,
    params,
):
    clip_param, c1, c2, c3, e = (
        params["clipping_parameter"],
        params["value_loss"],
        params["policy_loss"],
        params["entropy_loss"],
        10**-6,
    )
    # current pi(a|s)
    r_mask = tf.reshape(tf.one_hot(actions_training[:, 0], self.n_rotations), (-1, self.n_rotations, 1, 1), name='r_mask')
    t_mask = tf.reshape(tf.one_hot(actions_training[:, 1], self.n_translations), (-1, 1, self.n_translations, 1), name='t_mask')
    p_mask = tf.reshape(tf.one_hot(pieces_training[:, :], self.n_pieces), (-1, 1, 1, self.n_pieces), name='p_mask')
    rtp_mask = r_mask * t_mask * p_mask
    probability = tf.expand_dims(tf.reduce_sum(policy * rtp_mask, axis=[1, 2, 3]), 1)
    values = tf.reduce_sum(values * p_mask, axis=[2, 3])
    # probability ratio
    r = tf.maximum(probability, e) / tf.maximum(old_probs, e)
    clipped_r = tf.clip_by_value(r, 1 - clip_param, 1 + clip_param)
    r_saturation = tf.reduce_mean(tf.cast(tf.not_equal(r, clipped_r), tf.float32))
    if "compress_advantages" in self.settings:
        adv_compressor = compressor(**self.settings["compress_advantages"])
        advantages = adv_compressor(advantages)
    policy_loss = tf.minimum(r * advantages, clipped_r * advantages)
    # entropy
    entropy_bonus = action_entropy = tf.reduce_sum(N.action_entropy(policy + e) * p_mask, axis=3)
    n_actions = self.n_rotations * self.n_translations
    max_entropy = utils.entropy(np.ones(n_actions) / n_actions)
    if "entropy_floor_loss" in params:
        eps = params["ppo_epsilon"]
        entropy_floor = -eps * tf.math.log(eps / (n_actions - 1)) - (1 - eps) * tf.math.log(1 - eps)
        extra_entropy = -tf.nn.relu(entropy_floor - action_entropy)
        entropy_bonus += params["entropy_floor_loss"] * extra_entropy
    if "rescaled_entropy" in params:
        entropy_bonus += params["rescaled_entropy"] * (max_entropy - entropy_bonus)
    # tally up
    self.value_loss_tf = c1 * tf.losses.mean_squared_error(values, target_values)  # reduce loss
    self.policy_loss_tf = -c2 * tf.reduce_mean(policy_loss)  # increase expected advantages
    self.entropy_loss_tf = -c3 * tf.reduce_mean(entropy_bonus)  # increase entropy
    self.regularizer_tf = self.settings["nn_regularizer"] * tf.add_n([tf.nn.l2_loss(v) for v in self.main_net.variables])
    if "compress_value_loss" in self.settings:
        val_compressor = compressor(**self.settings["compress_value_loss"])
        self.value_loss_tf = val_compressor(self.value_loss_tf)
    self.loss_tf = self.value_loss_tf + self.policy_loss_tf + self.entropy_loss_tf + self.regularizer_tf
    training_ops = self.settings["optimizer"](learning_rate=params['lr']).minimize(self.loss_tf)
    # Stats: we like stats.
    self.output_as_stats(action_entropy, name='entropy/entropy')
    self.output_as_stats(entropy_bonus, name='entropy/entropy_bonus', only_mean=True)
    self.output_as_stats(values, name='misc/values')
    self.output_as_stats(target_values, name='misc/target_values')
    self.output_as_stats(r_saturation, name='misc/clip_saturation', only_mean=True)
    if "compress_advantages" in self.settings:
        self.output_as_stats(adv_compressor.x_mean, name='compressors/advantage/compressor', only_mean=True)
        self.output_as_stats(adv_compressor.x_max, name='compressors/advantage/compressor_max', only_mean=True)
        self.output_as_stats(adv_compressor.x_saturation, name='compressors/advantage/compressor_saturation', only_mean=True)
    if "compress_value_loss" in self.settings:
        self.output_as_stats(val_compressor.x_mean, name='compressors/valueloss/compressor', only_mean=True)
        self.output_as_stats(val_compressor.x_max, name='compressors/valueloss/compressor_max', only_mean=True)
        self.output_as_stats(val_compressor.x_saturation, name='compressors/valueloss/compressor_saturation', only_mean=True)
    self.output_as_stats(self.loss_tf, name='losses/total_loss', only_mean=True)
    self.output_as_stats(self.value_loss_tf, name='losses/value_loss', only_mean=True)
    self.output_as_stats(-self.policy_loss_tf, name='losses/policy_loss', only_mean=True)
    self.output_as_stats(-self.entropy_loss_tf, name='losses/entropy_loss', only_mean=True)
    self.output_as_stats(self.regularizer_tf, name='losses/regularizer_loss', only_mean=True)
    if self.settings.get('record-parameters-to-tb', False):
        for param_name in params:
            self.output_as_stats(params[param_name], name='parameters/' + param_name, only_mean=True)
    # Note: the returned update ops assume both compressors are configured in settings.
    return [training_ops, adv_compressor.update_op, val_compressor.update_op]
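def _demo_ppo_clipped_surrogate():
    # Illustrative only (toy numbers, not from the repo): a numpy version of the
    # clipped PPO surrogate assembled in create_training_ops above, i.e.
    # min(r * A, clip(r, 1-c, 1+c) * A) with r = pi_new(a|s) / pi_old(a|s).
    import numpy as np
    clip_param, e = 0.2, 10**-6
    new_p = np.array([0.30, 0.05, 0.20])   # hypothetical pi_new(a|s) per sample
    old_p = np.array([0.25, 0.10, 0.20])   # hypothetical pi_old(a|s) per sample
    adv = np.array([1.0, -0.5, 0.3])       # hypothetical advantages
    r = np.maximum(new_p, e) / np.maximum(old_p, e)
    clipped_r = np.clip(r, 1 - clip_param, 1 + clip_param)
    surrogate = np.minimum(r * adv, clipped_r * adv)
    # Training maximizes the mean surrogate (the loss above negates it).
    return float(surrogate.mean())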