def sample_distance_pairs(self, num_samples_per_cell=2, verbose=False):
  """Sample a set of points from each cell and compute all pairwise distances.

  This method also writes the resulting distances to disk.

  Args:
    num_samples_per_cell: int, number of samples to draw per cell.
    verbose: bool, whether to print verbose messages.
  """
  paired_states_ph = tf.placeholder(tf.float64, (1, 4),
                                    name='paired_states_ph')
  online_network = tf.make_template('Online', self._network_template)
  distance = online_network(paired_states_ph)
  saver = tf.train.Saver()
  if not self.add_noise:
    num_samples_per_cell = 1
  with tf.Session() as sess:
    saver.restore(sess, os.path.join(self.base_dir, 'tf_ckpt-239900'))
    total_samples = None
    for s_idx in range(self.num_states):
      s = self.inverse_index_states[s_idx]
      s = s.astype(np.float32)
      s += 0.5  # Place in center of cell.
      s = np.tile([s], (num_samples_per_cell, 1))
      if self.add_noise:
        sampled_noise = np.clip(
            np.random.normal(0, 0.1, size=(num_samples_per_cell, 2)),
            -0.3, 0.3)
        s += sampled_noise
      if total_samples is None:
        total_samples = s
      else:
        total_samples = np.concatenate([total_samples, s])
    num_total_samples = len(total_samples)
    distances = np.zeros((num_total_samples, num_total_samples))
    if verbose:
      tf.logging.info('Will compute distances for %d samples',
                      num_total_samples)
    for i in range(num_total_samples):
      s1 = total_samples[i]
      if verbose:
        tf.logging.info('Will compute distances from sample %d', i)
      # The metric is symmetrized via the max below, so it suffices to
      # compute each pair once and fill both entries.
      for j in range(i, num_total_samples):
        s2 = total_samples[j]
        paired_states_1 = np.reshape(np.append(s1, s2), (1, 4))
        paired_states_2 = np.reshape(np.append(s2, s1), (1, 4))
        distance_np_1 = sess.run(
            distance, feed_dict={paired_states_ph: paired_states_1})
        distance_np_2 = sess.run(
            distance, feed_dict={paired_states_ph: paired_states_2})
        max_dist = max(distance_np_1, distance_np_2)
        distances[i, j] = max_dist
        distances[j, i] = max_dist
  sampled_distances = {
      'samples_per_cell': num_samples_per_cell,
      'samples': total_samples,
      'distances': distances,
  }
  file_path = os.path.join(self.base_dir, 'sampled_distances.pkl')
  # Open in binary mode, since pickle writes bytes.
  with tf.gfile.GFile(file_path, 'wb') as f:
    pickle.dump(sampled_distances, f)
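# A minimal read-back sketch (not part of the original class), assuming
# `base_dir` points at the same directory used above; the dictionary keys
# match the ones written by sample_distance_pairs.
import os
import pickle

import tensorflow as tf

base_dir = '/tmp/bisimulation'  # Assumed output directory.
file_path = os.path.join(base_dir, 'sampled_distances.pkl')
with tf.gfile.GFile(file_path, 'rb') as f:
  sampled_distances = pickle.load(f)
# 'distances' is an (N, N) symmetric matrix over all sampled points.
print(sampled_distances['samples_per_cell'])
print(sampled_distances['distances'].shape)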
def __init__(self,
             num_actions=None,
             observation_size=None,
             num_players=None,
             gamma=0.99,
             update_horizon=1,
             min_replay_history=500,
             update_period=4,
             stack_size=1,
             target_update_period=500,
             epsilon_fn=linearly_decaying_epsilon,
             epsilon_train=0.02,
             epsilon_eval=0.001,
             epsilon_decay_period=1000,
             graph_template=dqn_template,
             tf_device='/cpu:*',
             use_staging=True,
             optimizer=tf.train.RMSPropOptimizer(
                 learning_rate=.0025,
                 decay=0.95,
                 momentum=0.0,
                 epsilon=1e-6,
                 centered=True)):
  """Initializes the agent and constructs its graph.

  Args:
    num_actions: int, number of actions the agent can take at any state.
    observation_size: int, size of observation vector.
    num_players: int, number of players playing this game.
    gamma: float, discount factor as commonly used in the RL literature.
    update_horizon: int, horizon at which updates are performed, the 'n' in
      n-step update.
    min_replay_history: int, number of stored transitions before training.
    update_period: int, period between DQN updates.
    stack_size: int, number of observations to use as state.
    target_update_period: int, update period for the target network.
    epsilon_fn: function expecting 4 parameters: (decay_period, step,
      warmup_steps, epsilon), and which returns the epsilon value used for
      exploration during training.
    epsilon_train: float, final epsilon for training.
    epsilon_eval: float, epsilon during evaluation.
    epsilon_decay_period: int, number of steps for epsilon to decay.
    graph_template: function for building the neural network graph.
    tf_device: str, TensorFlow device on which to run computations.
    use_staging: bool, when True use a staging area to prefetch the next
      sampling batch.
    optimizer: tf.train.Optimizer instance used for learning.
  """
  self.partial_reload = False

  tf.logging.info('Creating %s agent with the following parameters:',
                  self.__class__.__name__)
  tf.logging.info('\t gamma: %f', gamma)
  tf.logging.info('\t update_horizon: %d', update_horizon)
  tf.logging.info('\t min_replay_history: %d', min_replay_history)
  tf.logging.info('\t update_period: %d', update_period)
  tf.logging.info('\t target_update_period: %d', target_update_period)
  tf.logging.info('\t epsilon_train: %f', epsilon_train)
  tf.logging.info('\t epsilon_eval: %f', epsilon_eval)
  tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period)
  tf.logging.info('\t tf_device: %s', tf_device)
  tf.logging.info('\t use_staging: %s', use_staging)
  tf.logging.info('\t optimizer: %s', optimizer)

  # Global variables.
  self.num_actions = num_actions
  self.observation_size = observation_size
  self.num_players = num_players
  self.gamma = gamma
  self.update_horizon = update_horizon
  self.cumulative_gamma = math.pow(gamma, update_horizon)
  self.min_replay_history = min_replay_history
  self.target_update_period = target_update_period
  self.epsilon_fn = epsilon_fn
  self.epsilon_train = epsilon_train
  self.epsilon_eval = epsilon_eval
  self.epsilon_decay_period = epsilon_decay_period
  self.update_period = update_period
  self.eval_mode = False
  self.training_steps = 0
  self.batch_staged = False
  self.optimizer = optimizer

  with tf.device(tf_device):
    # Calling online_convnet will generate a new graph as defined in
    # graph_template using whatever input is passed, but will always share
    # the same weights.
    online_convnet = tf.make_template('Online', graph_template)
    target_convnet = tf.make_template('Target', graph_template)
    # The state of the agent. The last axis is the number of past
    # observations that make up the state.
    states_shape = (1, observation_size, stack_size)
    self.state = np.zeros(states_shape)
    self.state_ph = tf.placeholder(tf.uint8, states_shape, name='state_ph')
    self.legal_actions_ph = tf.placeholder(tf.float32,
                                           [self.num_actions],
                                           name='legal_actions_ph')
    self._q = online_convnet(state=self.state_ph,
                             num_actions=self.num_actions)
    self._replay = self._build_replay_memory(use_staging)
    self._replay_qs = online_convnet(self._replay.states, self.num_actions)
    self._replay_next_qt = target_convnet(self._replay.next_states,
                                          self.num_actions)
    self._train_op = self._build_train_op()
    self._sync_qt_ops = self._build_sync_op()

    # Greedy action, with illegal actions masked out by adding
    # legal_actions_ph to the Q-values before the argmax.
    self._q_argmax = tf.argmax(self._q + self.legal_actions_ph, axis=1)[0]

  # Set up a session and initialize variables.
  self._sess = tf.Session(
      '', config=tf.ConfigProto(allow_soft_placement=True))
  self._init_op = tf.global_variables_initializer()
  self._sess.run(self._init_op)

  self._saver = tf.train.Saver(max_to_keep=3)

  # This keeps track of the observed transitions during play, for each
  # player.
  self.transitions = [[] for _ in range(num_players)]
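# A minimal usage sketch (illustration only): `DQNAgent` is a placeholder
# name for the enclosing class, and the sizes below are made up. Illegal
# moves are masked by adding a large negative value to their Q-values
# before the argmax, matching the `self._q_argmax` op above.
import numpy as np

agent = DQNAgent(num_actions=4, observation_size=32, num_players=2)
legal_actions = np.zeros(4, dtype=np.float32)  # 0 for legal actions.
legal_actions[2:] = -np.inf  # Large negative value for illegal actions.
action = agent._sess.run(
    agent._q_argmax,
    feed_dict={agent.state_ph: agent.state,
               agent.legal_actions_ph: legal_actions})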
def _build_train_op(self, optimizer):
  """Build the TensorFlow graph used to learn the bisimulation metric.

  Args:
    optimizer: a tf.train optimizer.

  Returns:
    A TensorFlow op to minimize the bisimulation loss.
  """
  self.online_network = tf.make_template('Online', self._network_template)
  self.target_network = tf.make_template('Target', self._network_template)
  self.s1_ph = tf.placeholder(tf.float64, (self.batch_size, 2),
                              name='s1_ph')
  self.s2_ph = tf.placeholder(tf.float64, (self.batch_size, 2),
                              name='s2_ph')
  self.s1_online_distances = self.online_network(
      self._concat_states(self.s1_ph))
  self.s1_target_distances = self.target_network(
      self._concat_states(self.s1_ph))
  self.s2_target_distances = self.target_network(
      self._concat_states(self.s2_ph))
  self.action_ph = tf.placeholder(tf.int32, (self.batch_size,))
  self.rewards_ph = tf.placeholder(tf.float64, (self.batch_size,))
  # We use an expanding horizon for computing the distances.
  self.bisim_horizon_ph = tf.placeholder(tf.float64, ())
  # bisimulation_target_1 = rew_diff + gamma * next_distance.
  bisimulation_target_1 = tf.stop_gradient(self._build_bisimulation_target())
  # bisimulation_target_2 = curr_distance.
  bisimulation_target_2 = tf.stop_gradient(self.s1_target_distances)
  # We slowly taper in the maximum according to the bisim horizon.
  bisimulation_target = tf.maximum(
      bisimulation_target_1, bisimulation_target_2 * self.bisim_horizon_ph)
  # We zero out diagonal entries, since those estimate the distance between
  # a state and itself, which we know to be 0.
  diagonal_mask = 1.0 - tf.diag(tf.ones(self.batch_size, dtype=tf.float64))
  diagonal_mask = tf.reshape(diagonal_mask, (self.batch_size**2, 1))
  bisimulation_target *= diagonal_mask
  bisimulation_estimate = self.s1_online_distances
  # We only compare pairs of states on which the same action was taken, so
  # the loss mask is the same-action indicator matrix built below.
  indicators = tf.cast(self.action_ph, tf.float64)
  # indicators initially has shape [batch_size]; we first tile it:
  square_ids = tf.tile([indicators], [self.batch_size, 1])
  # We subtract square_ids from its transpose:
  square_ids = square_ids - tf.transpose(square_ids)
  # At this point all zero entries are the ones with equal IDs. We convert
  # the zeros in this matrix to 1s and everything else to 0s:
  loss_mask = 1 - tf.abs(tf.sign(square_ids))
  # Now reshape to match the shapes of the estimate and target.
  loss_mask = tf.reshape(loss_mask, (self.batch_size**2, 1))
  larger_targets = bisimulation_target - bisimulation_estimate
  larger_targets_count = tf.reduce_sum(
      tf.cast(larger_targets > 0., tf.float64))
  tf.summary.scalar('Learning/LargerTargets', larger_targets_count)
  tf.summary.scalar('Learning/NumUpdates', tf.count_nonzero(loss_mask))
  tf.summary.scalar('Learning/BisimHorizon', self.bisim_horizon_ph)
  bisimulation_loss = tf.losses.mean_squared_error(
      bisimulation_target, bisimulation_estimate, weights=loss_mask)
  tf.summary.scalar('Learning/loss', bisimulation_loss)
  # Plot average distance between sampled representations.
  average_distance = tf.reduce_mean(bisimulation_estimate)
  tf.summary.scalar('Approx/AverageDistance', average_distance)
  return optimizer.minimize(bisimulation_loss)
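# A standalone numpy sketch (illustration only) of the same-action mask
# construction above: loss_mask[i, j] is 1 exactly when actions i and j in
# the batch are equal, so the loss only compares state pairs that share an
# action.
import numpy as np

actions = np.array([0, 2, 0, 1], dtype=np.float64)  # Example batch.
square_ids = np.tile(actions, (len(actions), 1))
square_ids = square_ids - square_ids.T
loss_mask = 1 - np.abs(np.sign(square_ids))
# loss_mask ==
# [[1, 0, 1, 0],
#  [0, 1, 0, 0],
#  [1, 0, 1, 0],
#  [0, 0, 0, 1]]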