def _build_dependencies(self):
    """Creates the latent-space projection layers this model depends on.

    Builds two fully-connected heads of size `self.latent_dim`:
    one for the latent mean and one for the latent log-sigma.
    """
    self.z_mean = FullyConnected(
        self.mode, num_units=self.latent_dim, name='z_mean')
    self.z_log_sigma = FullyConnected(
        self.mode, num_units=self.latent_dim, name='z_log_sigma')
def graph_fn(mode, features, labels=None):
    """Builds the Q-value graph on top of the wrapped `_graph_fn`.

    Produces per-action values `a`, an optional state value `v`
    (dueling architectures only), and the combined `q`.
    """
    # Forward `labels` only when the wrapped graph_fn accepts them.
    kwargs = {'labels': labels} if 'labels' in get_arguments(self._graph_fn) else {}
    graph_outputs = self._graph_fn(mode=mode, features=features, **kwargs)
    a = FullyConnected(mode, num_units=self.num_actions)(graph_outputs)

    # Non-dueling: Q is just the per-action head.
    if self.dueling is None:
        return QModelSpec(graph_outputs=graph_outputs, a=a, v=None,
                          q=tf.identity(a))

    # Q = V(s) + A(s, a)
    v = FullyConnected(mode, num_units=1)(graph_outputs)
    if self.dueling == 'mean':
        q = v + (a - tf.reduce_mean(a, axis=1, keep_dims=True))
    elif self.dueling == 'max':
        q = v + (a - tf.reduce_max(a, axis=1, keep_dims=True))
    elif self.dueling == 'naive':
        q = v + a
    elif self.dueling is True:
        q = tf.identity(a)
    else:
        raise ValueError("The value `{}` provided for "
                         "dueling is unsupported.".format(self.dueling))
    return QModelSpec(graph_outputs=graph_outputs, a=a, v=v, q=q)
def graph_fn(mode, inputs):
    """Builds the Q-value graph on top of the wrapped `_graph_fn`.

    Produces per-action values `a`, an optional state value `v`
    (dueling architectures only), and the combined `q`, returned
    as a dict.

    Raises:
        ValueError: if `self.dueling` is set to an unsupported value.
    """
    graph_results = self._graph_fn(mode=mode, inputs=inputs)
    a = FullyConnected(mode, num_units=self.num_actions)(graph_results)
    v = None
    if self.dueling is None:
        q = a
    else:
        # Q = V(s) + A(s, a)
        v = FullyConnected(mode, num_units=1)(graph_results)
        if self.dueling == 'mean':
            q = v + (a - tf.reduce_mean(a, axis=1, keep_dims=True))
        elif self.dueling == 'max':
            q = v + (a - tf.reduce_max(a, axis=1, keep_dims=True))
        elif self.dueling == 'naive':
            q = v + a
        elif self.dueling is True:
            q = a
        else:
            # Fail loudly on a bad config value instead of silently
            # falling back to the non-dueling Q (consistent with the
            # sibling features/labels graph_fn).
            raise ValueError("The value `{}` provided for "
                             "dueling is unsupported.".format(self.dueling))
    return {'graph_results': graph_results, 'a': a, 'v': v, 'q': q}
def graph_fn(mode, inputs):
    """Builds the policy-gradient action graph on top of `_graph_fn`.

    Computes the action head `a`, the distribution parameters, and
    the policy distribution itself.
    """
    graph_outputs = self._graph_fn(mode=mode, inputs=inputs)
    a = FullyConnected(mode, num_units=self.num_actions)(graph_outputs)
    if self.is_continuous:
        # Location params plus a strictly-positive scale (exp(a) + 1).
        # NOTE(review): axis=0 concatenates along the first (batch?) axis
        # rather than the feature axis — confirm this is intended.
        values = tf.concat(values=[a, tf.exp(a) + 1], axis=0)
        dist_input = values
    else:
        values = tf.identity(a)
        # Discrete case builds the distribution from `a` directly,
        # mirroring the original behavior.
        dist_input = a
    distribution = self._build_distribution(values=dist_input)
    return PGModelSpec(graph_outputs=graph_outputs, a=a,
                       distribution=distribution, dist_values=values)
def graph_fn(mode, features, labels=None):
    """Builds the policy-gradient action graph on top of `_graph_fn`.

    Computes the action head `a`, the distribution parameters, and
    the policy distribution itself.
    """
    # Forward `labels` only when the wrapped graph_fn accepts them.
    kwargs = {'labels': labels} if 'labels' in get_arguments(self._graph_fn) else {}
    graph_outputs = self._graph_fn(mode=mode, features=features, **kwargs)
    a = FullyConnected(mode, num_units=self.num_actions)(graph_outputs)
    if self.is_continuous:
        # Location params plus a strictly-positive scale (exp(a) + 1).
        # NOTE(review): axis=0 concatenates along the first (batch?) axis
        # rather than the feature axis — confirm this is intended.
        values = tf.concat(values=[a, tf.exp(a) + 1], axis=0)
        dist_input = values
    else:
        values = tf.identity(a)
        # Discrete case builds the distribution from `a` directly,
        # mirroring the original behavior.
        dist_input = a
    distribution = self._build_distribution(values=dist_input)
    return PGModelSpec(
        graph_outputs=graph_outputs, a=a,
        distribution=distribution, dist_values=values)
def _build_loss(self, results, features, labels):
    """Creates the NAF loss operation.

    Builds the state-dependent positive-definite matrix P = LL^T from a
    lower-triangular L, forms the quadratic advantage around the policy
    mean, and regresses Q = V(s) + A(s, a) toward the one-step TD target
    computed from the target network's value head.

    Returns:
        tuple `(losses, loss)`: `losses` are the per-batch losses.
        `loss` is a single scalar tensor to minimize.
    """
    reward, action, done = labels['reward'], labels['action'], labels[
        'done']

    # Number of free entries in a lower-triangular num_actions matrix.
    lt_size = self.num_actions * (self.num_actions + 1) // 2
    lt_entries = FullyConnected(self.mode, num_units=lt_size)(
        self._train_results['graph_results'])

    # NAF requires L's diagonal to be strictly positive: exponentiate the
    # first `num_actions` entries, then place them on the diagonal.
    # BUG FIX: the previous `tf.exp(tf.map_fn(tf.diag, ...))` order also
    # exponentiated every off-diagonal zero to exp(0) == 1, filling the
    # whole matrix with ones and destroying the triangular structure.
    lt_matrix = tf.map_fn(tf.diag, tf.exp(lt_entries[:, :self.num_actions]))
    if self.num_actions > 1:
        # Fill the strictly-lower triangle with the remaining raw entries,
        # one padded column per row below the diagonal.
        offset = self.num_actions
        l_columns = list()
        for zeros, size in enumerate(xrange(self.num_actions - 1, 0, -1), 1):
            column = tf.pad(lt_entries[:, offset:offset + size],
                            ((0, 0), (zeros, 0)))
            l_columns.append(column)
            offset += size
        lt_matrix += tf.stack(l_columns, 1)

    # P = LL^T (positive definite by construction).
    p_matrix = tf.matmul(lt_matrix, tf.transpose(lt_matrix, (0, 2, 1)))
    action_diff = action - self._train_results['a']

    # A(s, a) = -(a - mu)^T P (a - mu) / 2
    advantage = -tf.matmul(
        tf.expand_dims(action_diff, 1),
        tf.matmul(p_matrix, tf.expand_dims(action_diff, 2))) / 2
    advantage = tf.squeeze(advantage, 2)

    # Q = V(s) + A(s, a); drop the last transition, whose successor value
    # comes from the target network shifted by one step.
    train_q_value = (self._train_results['v'] + advantage)[:-1]
    target_q_value = (reward[:-1] +
                      (1.0 - tf.cast(done[:-1], tf.float32)) *
                      self.discount * self._target_results['v'][1:])
    return super(NAFModel, self)._build_loss(train_q_value, features,
                                             target_q_value)