def get_input_tensor(self, embed_keep_prob=None, variable_scope=None, reuse=True):
    """Combine the ELMo layers in ``self.placeholder`` and project them.

    The layers are mixed according to ``self.strategy``, multiplied by a
    learned scalar and passed through ``classifiers.hidden``.

    Strategies:
        'three_layers': softmax-normalized learned mix over axis 2 of the
            placeholder (the einsum fixes that axis to length 3).
        'two_layers': learned interpolation between the last two entries of
            axis 2.
        'third_layer' / 'second_layer': the last / second-to-last entry of
            axis 2, unchanged.

    Args:
        embed_keep_prob: accepted for interface compatibility; not used.
        variable_scope: accepted for interface compatibility; variables are
            always created under ``'elmo_vocab'``.
        reuse: accepted for interface compatibility; not used.

    Returns:
        The result of ``classifiers.hidden`` applied to the scaled mix.

    Raises:
        AssertionError: if ``self.strategy`` is not one of the strategies
            listed above.
    """
    with tf.variable_scope('elmo_vocab'):
        if self.strategy == 'three_layers':
            weight = tf.get_variable('softmax_weight', shape=[3], initializer=tf.ones_initializer)
            softmax_weight = tf.nn.softmax(weight)
            # Mix with the *normalized* weights. The previous code computed
            # the softmax but then multiplied by the raw variable, leaving
            # the normalization as dead code.
            layer = tf.einsum('nabc,b->nac', self.placeholder, softmax_weight)
        elif self.strategy == 'two_layers':
            weight = tf.get_variable('weight', initializer=0.5)
            layer = (self.placeholder[:, :, -1] * weight
                     + (1 - weight) * self.placeholder[:, :, -2])
        elif self.strategy == 'third_layer':
            layer = self.placeholder[:, :, -1, :]
        elif self.strategy == 'second_layer':
            layer = self.placeholder[:, :, -2, :]
        else:
            assert 0, 'please specify the strategy'
        # Global learned scale applied on top of the layer mix.
        scalar = tf.get_variable('scalar', shape=[1], initializer=tf.ones_initializer)
        layer = scalar * layer
        layer = classifiers.hidden(layer, self.linear_size, hidden_func=self.hidden_func)
    return layer
def get_input_tensor(self, embed_keep_prob=None, variable_scope=None, reuse=True):
    """Project the placeholder through one hidden layer.

    Args:
        embed_keep_prob: falls back to ``self.embed_keep_prob``; the value is
            not consumed by this method.
        variable_scope: ignored; the scope is always ``'dephead_bert_vocab'``.
        reuse: ignored.

    Returns:
        ``classifiers.hidden(self.placeholder, self.linear_size, ...)``.
    """
    embed_keep_prob = embed_keep_prob or self.embed_keep_prob
    features = self.placeholder
    with tf.variable_scope('dephead_bert_vocab'):
        projected = classifiers.hidden(
            features,
            self.linear_size,
            hidden_func=self.hidden_func,
        )
    return projected
def get_linear_classifier(self, layer, token_weights, last_output=None, variable_scope=None, reuse=False):
    """Build a per-token softmax classifier on top of ``layer``.

    When ``last_output`` is given, its ``'hidden_layer'`` is used as the
    classifier input and no new fully-connected layers are stacked;
    otherwise ``self.n_layers`` hidden layers are applied to ``layer`` first.

    Args:
        layer: input features.
        token_weights: per-token weights used for the loss and the accuracy
            counts.
        last_output: optional output dict of a previous classifier providing
            ``'hidden_layer'`` and ``'recur_layer'``.
        variable_scope: scope name; defaults to ``self.classname``.
        reuse: when truthy, dropout is disabled (keep probability 1).

    Returns:
        dict with keys ``recur_layer``, ``hidden_layer``, ``targets``,
        ``probabilities``, ``loss``, ``predictions``, ``n_correct_tokens``
        and ``n_correct_sequences``.
    """
    if last_output is None:
        n_layers = self.n_layers
        recur_layer = layer
    else:
        n_layers = 0
        layer = last_output['hidden_layer']
        recur_layer = last_output['recur_layer']

    if reuse:
        keep_prob = 1
    else:
        keep_prob = self.hidden_keep_prob

    with tf.variable_scope(variable_scope or self.classname):
        for depth in six.moves.range(n_layers):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer,
                    self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=keep_prob,
                )
        with tf.variable_scope('Classifier'):
            logits = classifiers.linear_classifier(
                layer, len(self), hidden_keep_prob=keep_prob)
    gold = self.placeholder

    # Probabilities / cross entropy.
    # (n x m x c) -> (n x m x c)
    probs = tf.nn.softmax(logits)
    # (n x m), (n x m x c), (n x m) -> ()
    xent = tf.losses.sparse_softmax_cross_entropy(gold, logits, weights=token_weights)

    # Predictions / accuracy.
    # (n x m x c) -> (n x m)
    predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
    # (n x m) (*) (n x m) -> (n x m)
    hits = nn.equal(gold, predicted) * token_weights
    # (n x m) -> (n)
    seq_token_counts = tf.reduce_sum(token_weights, axis=-1)
    seq_hit_counts = tf.reduce_sum(hits, axis=-1)
    # A sequence is correct when every weighted token is correct.
    # (n), (n) -> (n)
    seq_hits = nn.equal(seq_token_counts, seq_hit_counts)

    return {
        'recur_layer': recur_layer,
        'hidden_layer': layer,
        'targets': gold,
        'probabilities': probs,
        'loss': xent,
        'predictions': predicted,
        'n_correct_tokens': tf.reduce_sum(hits),
        'n_correct_sequences': tf.reduce_sum(seq_hits),
    }
def forward(
        self, layers, decoder_embeddings, sentence_feat, token_weights,
        sequence_length, input_feed=None, target_copy_hidden_states=None,
        coverage=None, variable_scope=None, reuse=False, debug=False):
    """Project the decoder inputs to ``self.recur_size`` and run the decoder.

    Args:
        layers: outputs of the BiLSTM, [batch_size, seq_length, hidden_size].
        decoder_embeddings: [batch_size, decoder_seq_length, embedding_size].
        sentence_feat: final output state of the RNN,
            [num_encoder_layers, batch_size, hidden_size].
        token_weights: mask (not used by this method).
        sequence_length: forwarded to ``seq2seq_decoder``.
        input_feed: None or [batch_size, 1, hidden_size] (not used here).
        target_copy_hidden_states: None or
            [batch_size, seq_length, hidden_size] (not used here).
        coverage: None or [batch_size, 1, encode_seq_length] (not used here).
        variable_scope: not used; the scope is always ``'Seq2SeqDecoder'``.
        reuse: not used.
        debug: not used.

    Returns:
        Whatever ``seq2seq_decoder`` returns.
    """
    def project(scope_name, tensor):
        # One hidden layer of width recur_size under its own variable scope.
        with tf.variable_scope(scope_name):
            return classifiers.hidden(
                tensor,
                self.recur_size,
                hidden_func=self.hidden_func,
                hidden_keep_prob=self.hidden_keep_prob,
            )

    with tf.variable_scope('Seq2SeqDecoder'):
        sentence_feat = project('linear', sentence_feat)
        layers = project('memory_linear', layers)
        decoder_embeddings = project('embedding_linear', decoder_embeddings)
        decoded = seq2seq_decoder(
            self.cell, decoder_embeddings, layers, sequence_length, sentence_feat)
    return decoded
def get_input_tensor(self, embed_keep_prob=None, variable_scope=None, reuse=True):
    """Return BERT features projected through one hidden layer.

    With ``self.pretrained`` set, ``self.placeholder`` is used directly.
    Otherwise the features come from ``self.bert_model`` (second-to-last
    encoder layer when ``self.previous_layer`` is set, the sequence output
    otherwise) and are gathered with ``self.placeholder['mapping']``.

    Args:
        embed_keep_prob: falls back to ``self.embed_keep_prob``; the value is
            not consumed by this method.
        variable_scope: ignored; the scope is always ``'bert_vocab'``.
        reuse: ignored.

    Returns:
        ``classifiers.hidden`` applied to the selected features.

    Raises:
        AssertionError: for the unimplemented ``"average"`` strategy or an
            unknown ``self.strategy`` (non-pretrained path only).
    """
    embed_keep_prob = embed_keep_prob or self.embed_keep_prob

    if not self.pretrained:
        if self.previous_layer:
            encoded = self.bert_model.get_all_encoder_layers()[-2]
        else:
            encoded = self.bert_model.get_sequence_output()
        mapping = self.placeholder['mapping']
        if self.strategy == "first_value":
            gathered = tf.batch_gather(encoded, mapping)
            # Positions whose mapping index is 0 are zeroed out.
            keep_mask = tf.cast((mapping > 0), dtype=tf.float32)[:, :, None]
            encoded = gathered * keep_mask
        elif self.strategy == "average":
            assert 0, "not implemented"
        else:
            assert 0, "please specify bert strategy"
    else:
        encoded = self.placeholder

    with tf.variable_scope('bert_vocab'):
        projected = classifiers.hidden(
            encoded, self.linear_size, hidden_func=self.hidden_func)
    return projected
def get_input_tensor(self, embed_keep_prob=None, nonzero_init=False, variable_scope=None, reuse=True):
    """Encode every bucket with an RNN stack and look up the results.

    For each placeholder returned by ``self._multibucket.get_placeholders()``
    the ids are embedded, run through ``self.n_layers`` calls to
    ``recurrent.directed_RNN`` and squeezed to one vector according to
    ``self.squeeze_type``. The per-bucket results are concatenated, reordered
    with ``self._multibucket.placeholder`` and finally indexed with
    ``self.placeholder``.

    Args:
        embed_keep_prob: keep probability for the final dropout; defaults to
            ``self.embed_keep_prob``.
        nonzero_init: accepted but not forwarded; the embedding lookup is
            always called with ``nonzero_init=True``.
        variable_scope: scope name; defaults to ``self.classname``.
        reuse: when truthy, the conv/recur/output keep probabilities are
            forced to 1; it is also forwarded to the embedding lookup.

    Returns:
        The looked-up embeddings, with ``self.drop_func`` applied when
        ``embed_keep_prob < 1``.
    """
    embed_keep_prob = embed_keep_prob or self.embed_keep_prob
    conv_keep_prob = 1. if reuse else self.conv_keep_prob
    recur_keep_prob = 1. if reuse else self.recur_keep_prob
    output_keep_prob = 1. if reuse else self.output_keep_prob

    layers = []
    with tf.variable_scope(variable_scope or self.classname) as scope:
        for i, placeholder in enumerate(
                self._multibucket.get_placeholders()):
            # Every bucket after the first shares the first bucket's variables.
            if i:
                scope.reuse_variables()
            with tf.variable_scope('Embeddings'):
                layer = embeddings.token_embedding_lookup(
                    len(self), self.embed_size, placeholder,
                    nonzero_init=True, reuse=reuse)
            # Length = number of nonzero ids per row
            # (presumably id 0 is padding -- TODO confirm).
            seq_lengths = tf.count_nonzero(placeholder, axis=1, dtype=tf.int32)
            for j in six.moves.range(self.n_layers):
                # The first RNN layer may use a different convolution width.
                conv_width = self.first_layer_conv_width if not j else self.conv_width
                with tf.variable_scope('RNN-{}'.format(j)):
                    layer, final_states = recurrent.directed_RNN(
                        layer, self.recur_size, seq_lengths,
                        bidirectional=self.bidirectional,
                        recur_cell=self.recur_cell,
                        conv_width=conv_width,
                        recur_func=self.recur_func,
                        conv_keep_prob=conv_keep_prob,
                        recur_keep_prob=recur_keep_prob,
                        cifg=self.cifg,
                        highway=self.highway,
                        highway_func=self.highway_func,
                        bilin=self.bilin)

            # Squeeze the sequence of states down to a single vector.
            if not self.squeeze_type.startswith('gated'):
                if self.squeeze_type == 'linear_attention':
                    with tf.variable_scope('Attention'):
                        _, layer = classifiers.linear_attention(
                            layer, hidden_keep_prob=output_keep_prob)
                elif self.squeeze_type == 'final_hidden':
                    # First half of the final state along the last axis.
                    layer, _ = tf.split(final_states, 2, axis=-1)
                elif self.squeeze_type == 'final_cell':
                    # Second half of the final state along the last axis.
                    _, layer = tf.split(final_states, 2, axis=-1)
                elif self.squeeze_type == 'final_state':
                    layer = final_states
                elif self.squeeze_type == 'reduce_max':
                    layer = tf.reduce_max(layer, axis=-2)
                # NOTE(review): any other non-gated squeeze_type falls through
                # and feeds the unsqueezed RNN output to the linear layer.
                with tf.variable_scope('Linear'):
                    layer = classifiers.hidden(
                        layer, self.output_size,
                        hidden_func=self.output_func,
                        hidden_keep_prob=output_keep_prob)
            else:
                with tf.variable_scope('Attention'):
                    attn, layer = classifiers.deep_linear_attention(
                        layer, self.output_size,
                        hidden_func=nonlin.identity,
                        hidden_keep_prob=output_keep_prob)
                if self.squeeze_type == 'gated_reduce_max':
                    # ReLU of the max plus 0.1 times the sum divided by the
                    # nonzero count; 1e-12 guards against division by zero.
                    layer = tf.nn.relu(tf.reduce_max(
                        layer, axis=-2)) + .1 * tf.reduce_sum(
                            layer, axis=-2) / (tf.count_nonzero(
                                layer, axis=-2, dtype=tf.float32) + 1e-12)
                elif self.squeeze_type == 'gated_reduce_sum':
                    layer = self.output_func(tf.reduce_sum(layer, axis=-2))
            layers.append(layer)
        # Concatenate all the buckets' embeddings
        layer = tf.concat(layers, 0)
        # Put them in the right order, creating the embedding matrix
        layer = tf.nn.embedding_lookup(layer, self._multibucket.placeholder)
        # Get the embeddings from the embedding matrix
        layer = tf.nn.embedding_lookup(layer, self.placeholder)

        if embed_keep_prob < 1:
            layer = self.drop_func(layer, embed_keep_prob)
    return layer
def get_bilinear_classifier(self, layer, token_weights, variable_scope=None, reuse=False):
    """Build the bilinear head-selection classifier.

    ``layer`` is passed through ``self.n_layers - 1`` hidden layers and a
    final split layer producing two representations (plus two more for each
    of the optional linearization and distance terms). Bilinear attention
    between them gives the logits. When ``self.linearize`` /
    ``self.distance`` are set, gradient-stopped auxiliary scores are added to
    the logits and the corresponding target terms are subtracted from the
    loss.

    Args:
        layer: input features.
        token_weights: per-token weights (n x m) for the loss and accuracy.
        variable_scope: scope name; defaults to ``self.field``.
        reuse: when truthy, dropout is disabled (keep probability 1).

    Returns:
        dict with ``recur_layer``, ``unlabeled_targets``, ``probabilities``,
        ``unlabeled_loss``/``loss``, ``unlabeled_predictions``/``predictions``,
        ``correct_unlabeled_tokens`` and the ``n_correct_*`` counts.
    """
    recur_layer = layer
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    hidden_func = self.hidden_func
    hidden_size = self.hidden_size
    add_linear = self.add_linear
    linearize = self.linearize
    distance = self.distance
    # Two representations for the attention itself, plus two for each
    # enabled auxiliary term.
    n_splits = 2*(1+linearize+distance)
    with tf.variable_scope(variable_scope or self.field):
        for i in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(
                    layer, n_splits*hidden_size,
                    hidden_func=hidden_func,
                    hidden_keep_prob=hidden_keep_prob)
        with tf.variable_scope('FC-top'):
            layers = classifiers.hiddens(
                layer, n_splits*[hidden_size],
                hidden_func=hidden_func,
                hidden_keep_prob=hidden_keep_prob)
        # Consume the split outputs pairwise, in the order they were requested.
        layer1, layer2 = layers.pop(0), layers.pop(0)
        if linearize:
            lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
        if distance:
            dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)

        with tf.variable_scope('Attention'):
            if self.diagonal:
                logits, _ = classifiers.diagonal_bilinear_attention(
                    layer1, layer2,
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
                if linearize:
                    with tf.variable_scope('Linearization'):
                        lin_logits = classifiers.diagonal_bilinear_discriminator(
                            lin_layer1, lin_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear)
                if distance:
                    with tf.variable_scope('Distance'):
                        # softplus + 1 keeps the predicted distance >= 1.
                        dist_lamda = 1+tf.nn.softplus(classifiers.diagonal_bilinear_discriminator(
                            dist_layer1, dist_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear))
            else:
                logits, _ = classifiers.bilinear_attention(
                    layer1, layer2,
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
                if linearize:
                    with tf.variable_scope('Linearization'):
                        lin_logits = classifiers.bilinear_discriminator(
                            lin_layer1, lin_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear)
                if distance:
                    with tf.variable_scope('Distance'):
                        # softplus + 1 keeps the predicted distance >= 1.
                        dist_lamda = 1+tf.nn.softplus(classifiers.bilinear_discriminator(
                            dist_layer1, dist_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear))

        #-----------------------------------------------------------
        # Process the targets
        targets = self.placeholder
        shape = tf.shape(layer1)
        batch_size, bucket_size = shape[0], shape[1]
        # (1 x m)
        ids = tf.expand_dims(tf.range(bucket_size), 0)
        # (1 x m) -> (1 x 1 x m)
        head_ids = tf.expand_dims(ids, -2)
        # (1 x m) -> (1 x m x 1)
        dep_ids = tf.expand_dims(ids, -1)
        if linearize:
            # Wherever the head is to the left
            # (n x m), (1 x m) -> (n x m)
            # NOTE(review): lin_targets is not used after this assignment.
            lin_targets = tf.to_float(tf.less(targets, ids))
            # cross-entropy of the linearization of each i,j pair
            # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
            lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
            # (n x 1 x m), (n x m x 1) -> (n x m x m)
            lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
            # add the cross-entropy to the logits
            # (n x m x m), (n x m x m) -> (n x m x m)
            logits += tf.stop_gradient(lin_xent)
        if distance:
            # (n x m) - (1 x m) -> (n x m)
            # NOTE(review): dist_targets is not used after this assignment.
            dist_targets = tf.abs(targets - ids)
            # KL-divergence of the distance of each i,j pair
            # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
            dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1]))+1e-12
            # (n x m x m), (n x m x m) -> (n x m x m)
            dist_kld = -tf.log((dist_ids - dist_lamda)**2/2 + 1)
            # add the KL-divergence to the logits
            # (n x m x m), (n x m x m) -> (n x m x m)
            logits += tf.stop_gradient(dist_kld)

        #-----------------------------------------------------------
        # Compute probabilities/cross entropy
        # Position 0 is always counted as non-pad: the second term is 1 only
        # where the range index is 0.
        # (n x m) + (m) -> (n x m)
        non_pads = tf.to_float(token_weights) + tf.to_float(tf.logical_not(tf.cast(tf.range(bucket_size), dtype=tf.bool)))
        # (n x m x m) o (n x 1 x m) -> (n x m x m)
        probabilities = tf.nn.softmax(logits) * tf.expand_dims(non_pads, -2)
        # (n x m), (n x m x m), (n x m) -> ()
        loss = tf.losses.sparse_softmax_cross_entropy(
            targets, logits, weights=token_weights)
        # (n x m) -> (n x m x m x 1)
        one_hot_targets = tf.expand_dims(tf.one_hot(targets, bucket_size), -1)
        # (n x m) -> ()
        n_tokens = tf.to_float(tf.reduce_sum(token_weights))
        if linearize:
            # Select the auxiliary score at the gold head via a matmul with
            # the one-hot targets.
            # (n x m x m) -> (n x m x 1 x m)
            lin_xent_reshaped = tf.expand_dims(lin_xent, -2)
            # (n x m x 1 x m) * (n x m x m x 1) -> (n x m x 1 x 1)
            lin_target_xent = tf.matmul(lin_xent_reshaped, one_hot_targets)
            # (n x m x 1 x 1) -> (n x m)
            lin_target_xent = tf.squeeze(lin_target_xent, [-1, -2])
            # (n x m), (n x m), (n x m) -> ()
            loss -= tf.reduce_sum(lin_target_xent*tf.to_float(token_weights)) / (n_tokens + 1e-12)
        if distance:
            # (n x m x m) -> (n x m x 1 x m)
            dist_kld_reshaped = tf.expand_dims(dist_kld, -2)
            # (n x m x 1 x m) * (n x m x m x 1) -> (n x m x 1 x 1)
            dist_target_kld = tf.matmul(dist_kld_reshaped, one_hot_targets)
            # (n x m x 1 x 1) -> (n x m)
            dist_target_kld = tf.squeeze(dist_target_kld, [-1, -2])
            # (n x m), (n x m), (n x m) -> ()
            loss -= tf.reduce_sum(dist_target_kld*tf.to_float(token_weights)) / (n_tokens + 1e-12)

        #-----------------------------------------------------------
        # Compute predictions/accuracy
        # (n x m x m) -> (n x m)
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        # (n x m) (*) (n x m) -> (n x m)
        correct_tokens = nn.equal(targets, predictions) * token_weights
        # (n x m) -> (n)
        tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
        # (n x m) -> (n)
        correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
        # (n), (n) -> (n)
        correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)

    #-----------------------------------------------------------
    # Populate the output dictionary
    # The unlabeled_* keys and their plain counterparts hold the same values.
    outputs = {}
    outputs['recur_layer'] = recur_layer
    outputs['unlabeled_targets'] = self.placeholder
    outputs['probabilities'] = probabilities
    outputs['unlabeled_loss'] = loss
    outputs['loss'] = loss

    outputs['unlabeled_predictions'] = predictions
    outputs['predictions'] = predictions
    outputs['correct_unlabeled_tokens'] = correct_tokens
    outputs['n_correct_unlabeled_tokens'] = tf.reduce_sum(correct_tokens)
    outputs['n_correct_unlabeled_sequences'] = tf.reduce_sum(correct_sequences)
    outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
    outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
    return outputs
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False):
    """Build the bilinear edge discriminator (one sigmoid per token pair).

    ``layer`` is passed through ``self.n_layers - 1`` hidden layers and a
    final split layer. A bilinear discriminator over the first two splits
    gives the edge logits. When ``self.linearize`` / ``self.distance`` are
    set, gradient-stopped auxiliary scores are added to the logits and their
    target-masked sums are subtracted from the loss.

    Args:
        layer: input features.
        token_weights: pairwise weights (n x m x m) for the loss and metrics.
        variable_scope: scope name; defaults to ``self.field``.
        reuse: when truthy, dropout is disabled (keep probability 1).

    Returns:
        dict with targets, probabilities, loss, logits, predictions and the
        true/false positive/negative counts, each under both an
        ``unlabeled_*`` key and a plain key where applicable.
    """
    # NOTE(review): recur_layer is not used in this method.
    recur_layer = layer
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    add_linear = self.add_linear
    # Two representations for the discriminator, plus two for each enabled
    # auxiliary term.
    n_splits = 2*(1+self.linearize+self.distance)
    with tf.variable_scope(variable_scope or self.field):
        for i in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(
                    layer, n_splits*self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=hidden_keep_prob)
        with tf.variable_scope('FC-top'):
            layers = classifiers.hiddens(
                layer, n_splits*[self.hidden_size],
                hidden_func=self.hidden_func,
                hidden_keep_prob=hidden_keep_prob)
        # Consume the split outputs pairwise, in the order they were requested.
        layer1, layer2 = layers.pop(0), layers.pop(0)
        if self.linearize:
            lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
        if self.distance:
            dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)

        with tf.variable_scope('Discriminator'):
            if self.diagonal:
                logits = classifiers.diagonal_bilinear_discriminator(
                    layer1, layer2,
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
                if self.linearize:
                    with tf.variable_scope('Linearization'):
                        lin_logits = classifiers.diagonal_bilinear_discriminator(
                            lin_layer1, lin_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear)
                if self.distance:
                    with tf.variable_scope('Distance'):
                        # softplus + 1 keeps the predicted distance >= 1.
                        dist_lamda = 1+tf.nn.softplus(classifiers.diagonal_bilinear_discriminator(
                            dist_layer1, dist_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear))
            else:
                logits = classifiers.bilinear_discriminator(
                    layer1, layer2,
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
                if self.linearize:
                    with tf.variable_scope('Linearization'):
                        lin_logits = classifiers.bilinear_discriminator(
                            lin_layer1, lin_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear)
                if self.distance:
                    with tf.variable_scope('Distance'):
                        # softplus + 1 keeps the predicted distance >= 1.
                        dist_lamda = 1+tf.nn.softplus(classifiers.bilinear_discriminator(
                            dist_layer1, dist_layer2,
                            hidden_keep_prob=hidden_keep_prob,
                            add_linear=add_linear))

        #-----------------------------------------------------------
        # Process the targets
        # (n x m x m) -> (n x m x m)
        unlabeled_targets = self.placeholder
        shape = tf.shape(layer1)
        batch_size, bucket_size = shape[0], shape[1]
        # (1 x m)
        ids = tf.expand_dims(tf.range(bucket_size), 0)
        # (1 x m) -> (1 x 1 x m)
        head_ids = tf.expand_dims(ids, -2)
        # (1 x m) -> (1 x m x 1)
        dep_ids = tf.expand_dims(ids, -1)
        if self.linearize:
            # Wherever the head is to the left
            # (n x m x m), (1 x m x 1) -> (n x m x m)
            # NOTE(review): lin_targets is not used after this assignment.
            lin_targets = tf.to_float(tf.less(unlabeled_targets, dep_ids))
            # cross-entropy of the linearization of each i,j pair
            # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
            lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
            # (n x 1 x m), (n x m x 1) -> (n x m x m)
            lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
            # add the cross-entropy to the logits
            # (n x m x m), (n x m x m) -> (n x m x m)
            logits += tf.stop_gradient(lin_xent)
        if self.distance:
            # (n x m x m) - (1 x m x 1) -> (n x m x m)
            # NOTE(review): dist_targets is not used after this assignment.
            dist_targets = tf.abs(unlabeled_targets - dep_ids)
            # KL-divergence of the distance of each i,j pair
            # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
            dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1]))+1e-12
            # (n x m x m), (n x m x m) -> (n x m x m)
            dist_kld = -tf.log((dist_ids - dist_lamda)**2/2 + 1)
            # add the KL-divergence to the logits
            # (n x m x m), (n x m x m) -> (n x m x m)
            logits += tf.stop_gradient(dist_kld)

        #-----------------------------------------------------------
        # Compute probabilities/cross entropy
        # (n x m x m) -> (n x m x m)
        probabilities = tf.nn.sigmoid(logits) * tf.to_float(token_weights)
        # (n x m x m), (n x m x m), (n x m x m) -> ()
        loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)
        n_tokens = tf.to_float(tf.reduce_sum(token_weights))
        if self.linearize:
            # Keep the auxiliary score only where there is a gold edge.
            lin_target_xent = lin_xent * unlabeled_targets
            loss -= tf.reduce_sum(lin_target_xent * tf.to_float(token_weights)) / (n_tokens + 1e-12)
        if self.distance:
            dist_target_kld = dist_kld * unlabeled_targets
            loss -= tf.reduce_sum(dist_target_kld * tf.to_float(token_weights)) / (n_tokens + 1e-12)

        #-----------------------------------------------------------
        # Compute predictions/accuracy
        # An edge is predicted wherever the logit is positive.
        # (n x m x m) -> (n x m x m)
        predictions = nn.greater(logits, 0, dtype=tf.int64) * token_weights
        # (n x m x m) (*) (n x m x m) -> (n x m x m)
        true_positives = predictions * unlabeled_targets
        # (n x m x m) -> ()
        n_predictions = tf.reduce_sum(predictions)
        n_targets = tf.reduce_sum(unlabeled_targets)
        n_true_positives = tf.reduce_sum(true_positives)
        # () - () -> ()
        n_false_positives = n_predictions - n_true_positives
        n_false_negatives = n_targets - n_true_positives
        # (n x m x m) -> (n)
        n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
        n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
        # A sequence counts as correct when its true positives equal its targets.
        # (n) x 2 -> ()
        n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

    #-----------------------------------------------------------
    # Populate the output dictionary
    # The unlabeled_* keys and their plain counterparts hold the same values.
    outputs = {}
    outputs['unlabeled_targets'] = unlabeled_targets
    outputs['probabilities'] = probabilities
    outputs['unlabeled_loss'] = loss
    outputs['loss'] = loss
    outputs['logits'] = logits

    outputs['unlabeled_predictions'] = predictions
    outputs['n_unlabeled_true_positives'] = n_true_positives
    outputs['n_unlabeled_false_positives'] = n_false_positives
    outputs['n_unlabeled_false_negatives'] = n_false_negatives
    outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
    outputs['predictions'] = predictions
    outputs['n_true_positives'] = n_true_positives
    outputs['n_false_positives'] = n_false_positives
    outputs['n_false_negatives'] = n_false_negatives
    outputs['n_correct_sequences'] = n_correct_sequences
    return outputs
def get_linear_classifier(self, layer, token_weights, last_output=None, variable_scope=None, reuse=False):
    """Build one softmax classifier per feature on a shared hidden stack.

    When ``last_output`` is given (truthy), its ``'hidden_layer'`` is used as
    the classifier input and no fully-connected layers are stacked; otherwise
    ``self.n_layers`` hidden layers are applied to ``layer`` first. For every
    feature in ``self._feats`` a separate linear classifier is scored against
    the matching slice of ``self.placeholder``.

    Args:
        layer: input features.
        token_weights: per-token weights (n x m) for the losses and accuracy.
        last_output: optional output dict of a previous classifier providing
            ``'hidden_layer'`` and ``'recur_layer'``.
        variable_scope: scope name; defaults to ``self.classname``.
        reuse: when truthy, dropout is disabled (keep probability 1).

    Returns:
        dict with ``recur_layer``, ``targets``, ``probabilities`` (a list with
        one tensor per feature), ``loss`` (sum of the per-feature losses),
        ``predictions`` (n x m x f), ``n_correct_tokens`` and
        ``n_correct_sequences``.
    """
    if last_output:
        n_layers = 0
        layer = last_output['hidden_layer']
        recur_layer = last_output['recur_layer']
    else:
        n_layers = self.n_layers
        recur_layer = layer

    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    with tf.variable_scope(variable_scope or self.classname):
        # Use the local n_layers, not self.n_layers: when last_output is
        # supplied the hidden stack must be skipped (n_layers == 0).
        for i in six.moves.range(0, n_layers):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(
                    layer, self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=hidden_keep_prob)
        with tf.variable_scope('Classifier'):
            probabilities = []
            loss = []
            predictions = []
            correct_tokens = []
            for i, feat in enumerate(self._feats):
                # NOTE(review): the bracket tags look swapped ('[' -> RSB,
                # ']' -> LSB); left byte-for-byte because they name the
                # variable scopes of existing checkpoints.
                vs_feat = str(feat).replace('[', '-RSB-').replace(']', '-LSB-')
                with tf.variable_scope(vs_feat):
                    logits = classifiers.linear_classifier(
                        layer, self.getlen(feat),
                        hidden_keep_prob=hidden_keep_prob)
                    # The i-th slice of the last axis holds this feature's targets.
                    targets = self.placeholder[:, :, i]

                    #---------------------------------------------------
                    # Compute probabilities/cross entropy
                    # (n x m x c) -> (n x m x c)
                    probabilities.append(tf.nn.softmax(logits))
                    # (n x m), (n x m x c), (n x m) -> ()
                    loss.append(
                        tf.losses.sparse_softmax_cross_entropy(
                            targets, logits, weights=token_weights))

                    #---------------------------------------------------
                    # Compute predictions/accuracy
                    # (n x m x c) -> (n x m)
                    predictions.append(
                        tf.argmax(logits, axis=-1, output_type=tf.int32))
                    # (n x m) (*) (n x m) -> (n x m)
                    correct_tokens.append(
                        nn.equal(targets, predictions[-1]))
            # (n x m) x f -> (n x m x f)
            predictions = tf.stack(predictions, axis=-1)
            # (n x m) x f -> (n x m x f)
            correct_tokens = tf.stack(correct_tokens, axis=-1)
            # A token is correct only if every feature is correct.
            # (n x m x f) -> (n x m)
            correct_tokens = tf.reduce_prod(correct_tokens, axis=-1) * token_weights
            # (n x m) -> (n)
            tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
            # (n x m) -> (n)
            correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
            # (n), (n) -> (n)
            correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)

    #-----------------------------------------------------------
    # Populate the output dictionary
    outputs = {}
    outputs['recur_layer'] = recur_layer
    outputs['targets'] = self.placeholder
    outputs['probabilities'] = probabilities
    outputs['loss'] = tf.add_n(loss)

    outputs['predictions'] = predictions
    outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
    outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
    return outputs
def get_unfactored_bilinear_classifier(self, layer, token_weights, variable_scope=None, reuse=False):
    """Build a bilinear classifier that predicts edges and labels jointly.

    ``layer`` is passed through ``self.n_layers - 1`` hidden layers and a
    final split layer producing two representations, which a bilinear
    classifier scores over ``len(self)`` classes. A class greater than 0 is
    treated as "edge present", so unlabeled metrics are derived from the
    labeled predictions.

    Args:
        layer: input features.
        token_weights: pairwise weights (n x m x m) for the loss and metrics.
        variable_scope: scope name; defaults to ``self.field``.
        reuse: when truthy, dropout is disabled (keep probability 1).

    Returns:
        dict with targets, probabilities, loss, predictions and labeled /
        unlabeled true-positive, false-positive, false-negative and
        correct-sequence counts. ``unlabeled_loss`` is a constant 0.
    """
    recur_layer = layer
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    add_linear = self.add_linear
    with tf.variable_scope(variable_scope or self.field):
        for i in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(
                    layer, 2*self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=hidden_keep_prob)
        # The scope name was previously written as 'FC-top' % i, which raises
        # TypeError (no format specifier) and NameError when the loop above
        # never runs.
        with tf.variable_scope('FC-top'):
            # hiddens (plural) returns one tensor per requested size; the
            # singular helper was being called with a list and then pop()-ed.
            layers = classifiers.hiddens(
                layer, 2*[self.hidden_size],
                hidden_func=self.hidden_func,
                hidden_keep_prob=hidden_keep_prob)
        layer1, layer2 = layers.pop(0), layers.pop(0)

        with tf.variable_scope('Classifier'):
            if self.diagonal:
                logits = classifiers.diagonal_bilinear_classifier(
                    layer1, layer2, len(self),
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
            else:
                logits = classifiers.bilinear_classifier(
                    layer1, layer2, len(self),
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)

    #-----------------------------------------------------------
    # Process the targets
    targets = self.placeholder
    # Any label id above 0 counts as an edge.
    # (n x m x m) -> (n x m x m)
    unlabeled_targets = nn.greater(targets, 0)

    #-----------------------------------------------------------
    # Process the logits
    # (n x m x c x m) -> (n x m x m x c)
    transposed_logits = tf.transpose(logits, [0, 1, 3, 2])

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m x m x c) -> (n x m x m x c)
    probabilities = tf.nn.softmax(transposed_logits) * tf.to_float(tf.expand_dims(token_weights, axis=-1))
    # (n x m x m), (n x m x m x c), (n x m x m) -> ()
    loss = tf.losses.sparse_softmax_cross_entropy(targets, transposed_logits, weights=token_weights)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # (n x m x m x c) -> (n x m x m)
    predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int32) * token_weights
    # (n x m x m) -> (n x m x m)
    unlabeled_predictions = nn.greater(predictions, 0)
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    unlabeled_true_positives = unlabeled_predictions * unlabeled_targets
    # A labeled true positive needs both a correct edge and a correct label.
    true_positives = nn.equal(targets, predictions) * unlabeled_true_positives
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(unlabeled_predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_unlabeled_true_positives = tf.reduce_sum(unlabeled_true_positives)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_unlabeled_false_positives = n_predictions - n_unlabeled_true_positives
    n_unlabeled_false_negatives = n_targets - n_unlabeled_true_positives
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1, 2])
    n_unlabeled_true_positives_per_sequence = tf.reduce_sum(unlabeled_true_positives, axis=[1, 2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1, 2])
    # (n) x 2 -> ()
    n_correct_unlabeled_sequences = tf.reduce_sum(nn.equal(n_unlabeled_true_positives_per_sequence, n_targets_per_sequence))
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

    #-----------------------------------------------------------
    # Populate the output dictionary
    outputs = {}
    outputs['recur_layer'] = recur_layer
    outputs['unlabeled_targets'] = unlabeled_targets
    outputs['label_targets'] = self.placeholder
    outputs['probabilities'] = probabilities
    # There is no separate edge loss in the unfactored model.
    outputs['unlabeled_loss'] = tf.constant(0.)
    outputs['loss'] = loss

    outputs['unlabeled_predictions'] = unlabeled_predictions
    outputs['label_predictions'] = predictions
    outputs['n_unlabeled_true_positives'] = n_unlabeled_true_positives
    outputs['n_unlabeled_false_positives'] = n_unlabeled_false_positives
    outputs['n_unlabeled_false_negatives'] = n_unlabeled_false_negatives
    outputs['n_correct_unlabeled_sequences'] = n_correct_unlabeled_sequences
    outputs['n_true_positives'] = n_true_positives
    outputs['n_false_positives'] = n_false_positives
    outputs['n_false_negatives'] = n_false_negatives
    outputs['n_correct_sequences'] = n_correct_sequences
    return outputs
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
    """Build the edge-label classifier on top of an unlabeled edge model.

    ``outputs`` is the dict produced by the unlabeled model; it is updated in
    place with the label results and returned. The combined loss is
    ``2 * ((1 - rho) * outputs['loss'] + rho * label_loss)`` with
    ``rho = self.loss_interpolation``.

    Args:
        layer: input features.
        outputs: dict providing ``'unlabeled_predictions'``,
            ``'unlabeled_targets'``, ``'probabilities'`` and ``'loss'``.
        token_weights: pairwise weights (n x m x m).
        variable_scope: scope name; defaults to ``self.field``.
        reuse: when truthy, dropout is disabled (keep probability 1).

    Returns:
        The same ``outputs`` dict, with label targets, probabilities, losses,
        metric counts, label predictions and label logits added/overwritten.
    """
    if reuse:
        keep_prob = 1
    else:
        keep_prob = self.hidden_keep_prob
    add_linear = self.add_linear

    with tf.variable_scope(variable_scope or self.field):
        for depth in six.moves.range(self.n_layers - 1):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer,
                    2 * self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=keep_prob,
                )
        with tf.variable_scope('FC-top'):
            split = classifiers.hiddens(
                layer,
                2 * [self.hidden_size],
                hidden_func=self.hidden_func,
                hidden_keep_prob=keep_prob,
            )
        layer1 = split.pop(0)
        layer2 = split.pop(0)

        with tf.variable_scope('Classifier'):
            if self.diagonal:
                build_logits = classifiers.diagonal_bilinear_classifier
            else:
                build_logits = classifiers.bilinear_classifier
            logits = build_logits(
                layer1, layer2, len(self),
                hidden_keep_prob=keep_prob,
                add_linear=add_linear)

    # Targets: (n x m x m)
    label_targets = self.placeholder
    edge_predictions = outputs['unlabeled_predictions']
    edge_targets = outputs['unlabeled_targets']

    # (n x m x c x m) -> (n x m x m x c)
    transposed_logits = tf.transpose(logits, [0, 1, 3, 2])

    # Probabilities / cross entropy.
    # (n x m x m) -> (n x m x m x 1); no gradient flows into the edge model.
    head_probabilities = tf.expand_dims(
        tf.stop_gradient(outputs['probabilities']), axis=-1)
    # (n x m x m x c) -> (n x m x m x c)
    weight_column = tf.to_float(tf.expand_dims(token_weights, axis=-1))
    label_probabilities = tf.nn.softmax(transposed_logits) * weight_column
    # The label loss only counts positions with a gold edge.
    # (n x m x m), (n x m x m x c), (n x m x m) -> ()
    label_loss = tf.losses.sparse_softmax_cross_entropy(
        label_targets, transposed_logits, weights=token_weights * edge_targets)

    # Predictions / accuracy.
    # (n x m x m x c) -> (n x m x m)
    predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int64)
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = nn.equal(label_targets, predictions) * edge_predictions
    correct_label_tokens = nn.equal(label_targets, predictions) * edge_targets
    # (n x m x m) -> ()
    n_edge_predictions = tf.reduce_sum(edge_predictions)
    n_edge_targets = tf.reduce_sum(edge_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    n_correct_label_tokens = tf.reduce_sum(correct_label_tokens)
    # () - () -> ()
    n_false_positives = n_edge_predictions - n_true_positives
    n_false_negatives = n_edge_targets - n_true_positives
    # (n x m x m) -> (n)
    targets_per_seq = tf.reduce_sum(edge_targets, axis=[1, 2])
    true_positives_per_seq = tf.reduce_sum(true_positives, axis=[1, 2])
    correct_labels_per_seq = tf.reduce_sum(correct_label_tokens, axis=[1, 2])
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(
        nn.equal(true_positives_per_seq, targets_per_seq))
    n_correct_label_sequences = tf.reduce_sum(
        nn.equal(correct_labels_per_seq, targets_per_seq))

    # Merge the label results into the caller's dict.
    rho = self.loss_interpolation
    outputs.update({
        'label_targets': label_targets,
        'probabilities': label_probabilities * head_probabilities,
        'label_loss': label_loss,
        'loss': 2*((1-rho) * outputs['loss'] + rho * label_loss),
        'n_true_positives': n_true_positives,
        'n_false_positives': n_false_positives,
        'n_false_negatives': n_false_negatives,
        'n_correct_sequences': n_correct_sequences,
        'n_correct_label_tokens': n_correct_label_tokens,
        'n_correct_label_sequences': n_correct_label_sequences,
        'label_predictions': predictions,
        'label_logits': transposed_logits,
    })
    return outputs
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False):
    """Build an unlabeled bilinear edge discriminator and its loss/metrics.

    Runs `layer` through `self.n_layers - 1` hidden layers of width
    `2 * self.hidden_size`, then through one 'FC-top' layer that is split
    into two `self.hidden_size` views. The two views are scored against each
    other with `classifiers.diagonal_bilinear_discriminator` when
    `self.diagonal` is set, otherwise `classifiers.bilinear_discriminator`.
    The targets are `self.placeholder > 0` and the loss is a weighted
    sigmoid cross-entropy.

    Args:
        layer: input tensor; returned untouched as `outputs['recur_layer']`.
        token_weights: weights handed to the loss and multiplied into the
            thresholded predictions (presumably an (n x m x m) 0/1 mask,
            going by the shape comments below -- confirm against the caller).
        variable_scope: optional scope name; falls back to `self.classname`.
        reuse: when truthy, dropout is disabled (keep probability of 1).

    Returns:
        dict holding the targets, probabilities, loss, predictions and the
        true-positive / false-positive / false-negative / correct-sequence
        counts, each count stored under both an `unlabeled`-qualified key and
        a plain key.
    """
    recur_layer = layer
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    add_linear = self.add_linear
    with tf.variable_scope(variable_scope or self.classname):
        for i in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(layer, 2*self.hidden_size,
                                           hidden_func=self.hidden_func,
                                           hidden_keep_prob=hidden_keep_prob)
        # The scope name is a plain literal: it has no format field, so
        # applying `% i` to it raises TypeError (and `i` is unbound when the
        # loop above runs zero times).
        with tf.variable_scope('FC-top'):
            layers = classifiers.hiddens(layer, 2*[self.hidden_size],
                                         hidden_func=self.hidden_func,
                                         hidden_keep_prob=hidden_keep_prob)
        layer1, layer2 = layers.pop(0), layers.pop(0)

        with tf.variable_scope('Discriminator'):
            if self.diagonal:
                logits = classifiers.diagonal_bilinear_discriminator(
                    layer1, layer2,
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
            else:
                logits = classifiers.bilinear_discriminator(
                    layer1, layer2,
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)

    #-----------------------------------------------------------
    # Process the targets
    # (n x m x m) -> (n x m x m)
    unlabeled_targets = nn.greater(self.placeholder, 0)

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m x m) -> (n x m x m)
    probabilities = tf.nn.sigmoid(logits)
    # (n x m x m), (n x m x m), (n x m x m) -> ()
    loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # A positive logit means sigmoid > 0.5, i.e. a predicted edge.
    # (n x m x m) -> (n x m x m)
    predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = predictions * unlabeled_targets
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
    # A sequence counts as correct when every target edge is a true positive.
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

    #-----------------------------------------------------------
    # Populate the output dictionary
    outputs = {}
    outputs['recur_layer'] = recur_layer
    outputs['unlabeled_targets'] = unlabeled_targets
    outputs['probabilities'] = probabilities
    outputs['unlabeled_loss'] = loss
    outputs['loss'] = loss

    outputs['unlabeled_predictions'] = predictions
    outputs['n_unlabeled_true_positives'] = n_true_positives
    outputs['n_unlabeled_false_positives'] = n_false_positives
    outputs['n_unlabeled_false_negatives'] = n_false_negatives
    outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
    outputs['predictions'] = predictions
    outputs['n_true_positives'] = n_true_positives
    outputs['n_false_positives'] = n_false_positives
    outputs['n_false_negatives'] = n_false_negatives
    outputs['n_correct_sequences'] = n_correct_sequences
    return outputs
def get_unfactored_bilinear_classifier(self, layer, unlabeled_targets, token_weights, variable_scope=None, reuse=False):
    """Classify each token with one softmax over the flattened (class, head) axis.

    The bilinear classifier's logits are reshaped so the class axis and the
    head axis become a single axis of size `len(self) * bucket_size`; the
    target index on that axis is `bucket_size * self.placeholder +
    unlabeled_targets`.

    Args:
        layer: input tensor; returned untouched as `outputs['recur_layer']`.
        unlabeled_targets: tensor added to the scaled placeholder to form the
            flattened target and compared against the predicted head index
            (presumably (n x m) gold head indices -- confirm with caller).
        token_weights: weights for the loss and the correctness counts.
        variable_scope: optional scope name; falls back to `self.classname`.
        reuse: when truthy, dropout is disabled (keep probability of 1).

    Returns:
        dict with the loss, probabilities, predictions and correctness counts.
        Note that `outputs['label_predictions']` holds the argmax over the
        flattened axis as-is; only `outputs['unlabeled_predictions']` is
        reduced modulo `bucket_size`.
    """
    recur_layer = layer
    keep_prob = 1 if reuse else self.hidden_keep_prob
    activation = self.hidden_func
    width = self.hidden_size
    use_linear = self.add_linear

    with tf.variable_scope(variable_scope or self.classname):
        for depth in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer, 2*width,
                    hidden_func=activation,
                    hidden_keep_prob=keep_prob)
        with tf.variable_scope('FC-top'):
            split_layers = classifiers.hiddens(
                layer, 2*[width],
                hidden_func=activation,
                hidden_keep_prob=keep_prob)
        layer1, layer2 = split_layers[0], split_layers[1]

        with tf.variable_scope('Classifier'):
            # Both scorers take the same arguments, so choose one and call it once.
            if self.diagonal:
                score_fn = classifiers.diagonal_bilinear_classifier
            else:
                score_fn = classifiers.bilinear_classifier
            logits = score_fn(
                layer1, layer2, len(self),
                hidden_keep_prob=keep_prob,
                add_linear=use_linear)
            bucket_size = tf.shape(layer)[-2]

            # ---- targets: one index on the flattened (class, head) axis ----
            targets = bucket_size * self.placeholder + unlabeled_targets

            # ---- logits: (n x m x c x m) -> (n x m x cm) ----
            flat_shape = tf.stack([-1, bucket_size, bucket_size * len(self)])
            reshaped_logits = tf.reshape(logits, flat_shape)

            # ---- probabilities / cross entropy ----
            # softmax over cm, then back to (n x m x c x m) and on to (n x m x m x c)
            probabilities = tf.nn.softmax(reshaped_logits)
            unflat_shape = tf.stack([-1, bucket_size, len(self), bucket_size])
            probabilities = tf.reshape(probabilities, unflat_shape)
            probabilities = tf.transpose(probabilities, [0,1,3,2])
            loss = tf.losses.sparse_softmax_cross_entropy(
                targets, reshaped_logits, weights=token_weights)

            # ---- predictions / accuracy ----
            # (n x m x cm) -> (n x m)
            predictions = tf.argmax(reshaped_logits, axis=-1, output_type=tf.int32)
            # the head index is the remainder of the flattened index
            unlabeled_predictions = tf.mod(predictions, bucket_size)
            correct_tokens = nn.equal(predictions, targets) * token_weights
            correct_unlabeled_tokens = (
                nn.equal(unlabeled_predictions, unlabeled_targets) * token_weights)
            # (n x m) -> (n)
            tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
            correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
            correct_unlabeled_tokens_per_sequence = tf.reduce_sum(
                correct_unlabeled_tokens, axis=-1)
            # a sequence is correct when all of its weighted tokens are correct
            correct_sequences = nn.equal(
                tokens_per_sequence, correct_tokens_per_sequence)
            correct_unlabeled_sequences = nn.equal(
                tokens_per_sequence, correct_unlabeled_tokens_per_sequence)

    # ---- output dictionary ----
    return {
        'recur_layer': recur_layer,
        'unlabeled_targets': unlabeled_targets,
        'probabilities': probabilities,
        'unlabeled_loss': tf.constant(0.),
        'loss': loss,
        'unlabeled_predictions': unlabeled_predictions,
        'label_predictions': predictions,
        'n_correct_unlabeled_tokens': tf.reduce_sum(correct_unlabeled_tokens),
        'n_correct_unlabeled_sequences': tf.reduce_sum(correct_unlabeled_sequences),
        'n_correct_tokens': tf.reduce_sum(correct_tokens),
        'n_correct_sequences': tf.reduce_sum(correct_sequences),
    }
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
    """Add a label classifier on top of an existing head-prediction `outputs` dict.

    Label logits are selected twice: once at the gold heads (used for the
    label loss and the label-only accuracy) and once at the predicted heads
    (used for the reported label predictions and the joint accuracy).

    Args:
        layer: input tensor for the hidden layers.
        outputs: dict produced by the head predictor. This method reads its
            'unlabeled_predictions', 'unlabeled_targets', 'probabilities',
            'loss' and 'correct_unlabeled_tokens' entries, then updates the
            dict in place.
        token_weights: weights for the label loss and the label accuracy.
        variable_scope: optional scope name; falls back to `self.classname`.
        reuse: when truthy, dropout is disabled (keep probability of 1).

    Returns:
        The same `outputs` dict, with 'loss' replaced by
        `2*((1-rho)*loss + rho*label_loss)` where `rho` is
        `self.loss_interpolation`, 'probabilities' replaced by the product of
        label and head probabilities, and the label entries added.
    """
    keep_prob = 1 if reuse else self.hidden_keep_prob
    activation = self.hidden_func
    width = self.hidden_size
    use_linear = self.add_linear

    with tf.variable_scope(variable_scope or self.classname):
        for depth in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer, 2*width,
                    hidden_func=activation,
                    hidden_keep_prob=keep_prob)
        with tf.variable_scope('FC-top'):
            split_layers = classifiers.hiddens(
                layer, 2*[width],
                hidden_func=activation,
                hidden_keep_prob=keep_prob)
        layer1, layer2 = split_layers[0], split_layers[1]

        with tf.variable_scope('Classifier'):
            # Both scorers take the same arguments, so choose one and call it once.
            if self.diagonal:
                score_fn = classifiers.diagonal_bilinear_classifier
            else:
                score_fn = classifiers.bilinear_classifier
            logits = score_fn(
                layer1, layer2, len(self),
                hidden_keep_prob=keep_prob,
                add_linear=use_linear)
            bucket_size = tf.shape(layer)[-2]

            # ---- targets ----
            # (n x m)
            label_targets = self.placeholder
            # (n x m) -> (n x m x m): one-hot over the head axis
            predicted_heads = tf.one_hot(outputs['unlabeled_predictions'], bucket_size)
            gold_heads = tf.one_hot(outputs['unlabeled_targets'], bucket_size)
            # (n x m x m) -> (n x m x m x 1): column vectors for the matmul below
            predicted_heads = tf.expand_dims(predicted_heads, axis=-1)
            gold_heads = tf.expand_dims(gold_heads, axis=-1)

            # ---- logits ----
            # (n x m x c x m) -> (n x m x m x c)
            transposed_logits = tf.transpose(logits, [0,1,3,2])
            # Multiplying by a one-hot column picks out the class scores at
            # that head: (n x m x c x m) * (n x m x m x 1) -> (n x m x c x 1)
            predicted_logits = tf.matmul(logits, predicted_heads)
            oracle_logits = tf.matmul(logits, gold_heads)
            # (n x m x c x 1) -> (n x m x c)
            predicted_logits = tf.squeeze(predicted_logits, axis=-1)
            oracle_logits = tf.squeeze(oracle_logits, axis=-1)

            # ---- probabilities / cross entropy ----
            # no gradient flows back into the head probabilities from here
            head_probabilities = tf.expand_dims(
                tf.stop_gradient(outputs['probabilities']), axis=-1)
            label_probabilities = tf.nn.softmax(transposed_logits)
            # the label loss is computed at the gold heads
            label_loss = tf.losses.sparse_softmax_cross_entropy(
                label_targets, oracle_logits, weights=token_weights)

            # ---- predictions / accuracy ----
            # (n x m x c) -> (n x m)
            label_predictions = tf.argmax(
                predicted_logits, axis=-1, output_type=tf.int32)
            label_oracle_predictions = tf.argmax(
                oracle_logits, axis=-1, output_type=tf.int32)
            # label-only accuracy is measured at the gold heads; joint accuracy
            # needs the label at the predicted head AND a correct head
            correct_label_tokens = (
                nn.equal(label_targets, label_oracle_predictions) * token_weights)
            correct_tokens = (
                nn.equal(label_targets, label_predictions)
                * outputs['correct_unlabeled_tokens'])
            # (n x m) -> (n)
            tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
            correct_label_tokens_per_sequence = tf.reduce_sum(
                correct_label_tokens, axis=-1)
            correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
            # (n), (n) -> (n)
            correct_label_sequences = nn.equal(
                tokens_per_sequence, correct_label_tokens_per_sequence)
            correct_sequences = nn.equal(
                tokens_per_sequence, correct_tokens_per_sequence)

    # ---- output dictionary ----
    interpolation = self.loss_interpolation
    # The dict literal is fully evaluated before `update` runs, so the
    # 'loss' expression still sees the incoming head loss.
    outputs.update({
        'label_targets': label_targets,
        'probabilities': label_probabilities * head_probabilities,
        'label_loss': label_loss,
        'loss': 2*((1-interpolation) * outputs['loss'] + interpolation * label_loss),
        'label_predictions': label_predictions,
        'n_correct_label_tokens': tf.reduce_sum(correct_label_tokens),
        'n_correct_label_sequences': tf.reduce_sum(correct_label_sequences),
        'n_correct_tokens': tf.reduce_sum(correct_tokens),
        'n_correct_sequences': tf.reduce_sum(correct_sequences),
    })
    return outputs
def get_sampled_linear_classifier(self, layer, n_samples, token_weights=None, variable_scope=None, reuse=False):
    """Build a sampled-softmax linear classifier over `len(self)` classes.

    After `self.n_layers` hidden layers, each position is scored against its
    own target class plus `n_samples` classes drawn once with
    `tf.nn.log_uniform_candidate_sampler`; sampled classes that coincide with
    the position's target are given zero weight in the normalizer.

    Args:
        layer: input tensor; returned untouched as `outputs['recur_layer']`.
        n_samples: number of negative classes to sample.
        token_weights: weights for the loss and the accuracy counts.
            NOTE(review): defaults to None but is used unconditionally below,
            so callers apparently must always supply it -- confirm.
        variable_scope: optional scope name; falls back to `self.classname`.
        reuse: when truthy, dropout is disabled (keep probability of 1).

    Returns:
        dict with 'recur_layer', 'targets', 'probabilities' (a `tf.tuple` of
        the sample ids and the candidate probabilities), 'loss',
        'predictions', 'n_correct_tokens' and 'n_correct_sequences'.
    """
    recur_layer = layer
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    with tf.variable_scope(variable_scope or self.classname):
        for i in six.moves.range(0, self.n_layers):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(layer, self.hidden_size,
                                           hidden_func=self.hidden_func,
                                           hidden_keep_prob=hidden_keep_prob)
        batch_size, bucket_size, input_size = nn.get_sizes(layer)
        # noise_shape has a 1 on the middle axis, so one dropout mask is
        # shared along that axis
        layer = nn.dropout(layer, hidden_keep_prob, noise_shape=[batch_size, 1, input_size])
        layer = nn.reshape(layer, [-1, input_size])
        with tf.variable_scope('Classifier'):
            # One shared set of negatives for the whole batch
            # (s)
            samples, _, _ = tf.nn.log_uniform_candidate_sampler(
                nn.zeros([bucket_size,1], dtype=tf.int64),
                1, n_samples, unique=True, range_max=len(self))
            # NOTE(review): the device is hard-coded to '/gpu:1'; the extent
            # of this device block (variable creation only) should be
            # confirmed against the original layout.
            with tf.device('/gpu:1'):
                weights = tf.get_variable('Weights', shape=[len(self), input_size], initializer=tf.zeros_initializer)
                biases = tf.get_variable('Biases', shape=len(self), initializer=tf.zeros_initializer)
                # Both variables are registered in the 'non_save_variables' collection
                tf.add_to_collection('non_save_variables', weights)
                tf.add_to_collection('non_save_variables', biases)

            # (nm x 1)
            targets = nn.reshape(self.placeholder, [-1, 1])
            # (1 x s)
            samples = tf.expand_dims(samples, 0)
            # (nm x s)
            samples = tf.to_int32(nn.tile(samples, [batch_size*bucket_size, 1]))
            # Zero weight wherever a sampled class equals the position's target
            # (nm x s)
            sample_weights = tf.to_float(nn.not_equal(samples, targets))
            # Candidate ids: the target sits in column 0, the samples follow
            # (nm x 1+s)
            cands = tf.stop_gradient(tf.concat([targets, samples], axis=-1))
            # The target column always gets weight 1
            # (nm x 1), (nm x s) -> (nm x 1+s)
            cand_weights = tf.stop_gradient(tf.concat([nn.ones([batch_size*bucket_size, 1]), sample_weights], axis=-1))
            # (c x d), (nm x 1+s) -> (nm x 1+s x d)
            weights = tf.nn.embedding_lookup(weights, cands)
            # NOTE(review): `biases` is not referenced again after this lookup,
            # so the logits below carry no bias term -- confirm this is intended.
            # (c), (nm x 1+s) -> (nm x 1+s)
            biases = tf.nn.embedding_lookup(biases, cands)
            # (nm x d) -> (nm x d x 1)
            layer_reshaped = nn.reshape(layer, [-1, input_size, 1])
            # (nm x 1+s x d) * (nm x d x 1) -> (nm x 1+s x 1)
            logits = tf.matmul(weights, layer_reshaped)
            # (nm x 1+s x 1) -> (nm x 1+s)
            logits = tf.squeeze(logits, -1)

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # Shift by the row maximum before exponentiating
    # (nm x 1+s)
    logits = logits - tf.reduce_max(logits, axis=-1, keep_dims=True)
    # Candidates with zero weight drop out of the normalizer
    # (nm x 1+s)
    exp_logits = tf.exp(logits) * cand_weights
    # (nm x 1)
    exp_logit_sum = tf.reduce_sum(exp_logits, axis=-1, keep_dims=True)
    # (nm x 1+s)
    probabilities = exp_logits / exp_logit_sum
    # (nm x 1+s) -> (n x m x 1+s)
    probabilities = nn.reshape(probabilities, [batch_size, bucket_size, 1+n_samples])
    # NOTE(review): `samples` was tiled to (nm x s) above, yet it is reshaped
    # to 1+n_samples columns here; the sizes look inconsistent and `cands`
    # (nm x 1+s) may have been intended -- confirm.
    samples = nn.reshape(samples, [batch_size, bucket_size, 1+n_samples])
    # Column 0 holds the target's logit
    # (nm x 1+s) -> (nm x 1), (nm x s)
    target_logits, _ = tf.split(logits, [1, n_samples], axis=1)
    # Negative log-likelihood: log of the normalizer minus the target logit
    # (nm x 1) - (nm x 1) -> (nm x 1)
    loss = tf.log(exp_logit_sum) - target_logits
    # (n x m) -> (nm x 1)
    token_weights1D = tf.to_float(nn.reshape(token_weights, [-1,1]))
    # Weighted mean over positions
    # (nm x 1) -> ()
    loss = tf.reduce_sum(loss*token_weights1D) / tf.reduce_sum(token_weights1D)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # (nm x 1+s) -> (n x m x 1+s)
    logits = nn.reshape(logits, [batch_size, bucket_size, -1])
    # Predictions are candidate positions, not class ids
    # (n x m x 1+s) -> (n x m)
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    # Candidate 0 is the target, so predicting 0 means the token is correct
    # (n x m) (*) (n x m) -> (n x m)
    correct_tokens = nn.equal(predictions, 0) * token_weights
    # (n x m) -> (n)
    tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
    # (n x m) -> (n)
    correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
    # A sequence is correct when all of its weighted tokens are correct
    # (n), (n) -> (n)
    correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)

    #-----------------------------------------------------------
    # Populate the output dictionary
    outputs = {}
    outputs['recur_layer'] = recur_layer
    outputs['targets'] = targets
    outputs['probabilities'] = tf.tuple([samples, probabilities])
    outputs['loss'] = loss
    outputs['predictions'] = predictions
    outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
    outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
    return outputs
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False, debug=False):
    """Build the edge discriminator, with optional linearization/distance terms.

    The top hidden layer is split into `2 * (1 + self.linearize +
    self.distance)` views of width `self.hidden_size`: one pair for the edge
    scores and one further pair for each enabled auxiliary term. Each enabled
    term is added to the edge logits behind a stop-gradient and is also
    subtracted from the loss at the target positions.

    Args:
        layer: input tensor for the hidden layers.
        token_weights: weights for the loss; also multiplied into the
            probabilities and the thresholded predictions.
        variable_scope: optional scope name; falls back to `self.field`.
        reuse: when truthy, dropout is disabled (keep probability of 1).
        debug: when truthy, additionally stores `outputs['printdata']`
            (the logits) and `outputs['temp_targets']`.

    Returns:
        dict holding the targets, probabilities, loss, predictions and the
        true-positive / false-positive / false-negative / correct-sequence
        counts, each count stored under both an `unlabeled`-qualified key and
        a plain key.
    """
    outputs = {}
    keep_prob = 1 if reuse else self.hidden_keep_prob
    use_linear = self.add_linear
    n_splits = 2 * (1 + self.linearize + self.distance)

    with tf.variable_scope(variable_scope or self.field):
        for depth in six.moves.range(0, self.n_layers - 1):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer, n_splits * self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=keep_prob)
        with tf.variable_scope('FC-top'):
            views = classifiers.hiddens(
                layer, n_splits * [self.hidden_size],
                hidden_func=self.hidden_func,
                hidden_keep_prob=keep_prob)

        # Hand the views out pairwise, in the order they were produced.
        layer1, layer2 = views[0], views[1]
        next_view = 2
        if self.linearize:
            lin_layer1, lin_layer2 = views[next_view], views[next_view + 1]
            next_view += 2
        if self.distance:
            dist_layer1, dist_layer2 = views[next_view], views[next_view + 1]

        with tf.variable_scope('Discriminator'):
            # The diagonal and full scorers are called identically, so select
            # one and reuse it for every pair of views.
            if self.diagonal:
                scorer = classifiers.diagonal_bilinear_discriminator
            else:
                scorer = classifiers.bilinear_discriminator
            logits = scorer(
                layer1, layer2,
                hidden_keep_prob=keep_prob,
                add_linear=use_linear)
            if self.linearize:
                with tf.variable_scope('Linearization'):
                    lin_logits = scorer(
                        lin_layer1, lin_layer2,
                        hidden_keep_prob=keep_prob,
                        add_linear=use_linear)
            if self.distance:
                with tf.variable_scope('Distance'):
                    dist_lamda = 1 + tf.nn.softplus(scorer(
                        dist_layer1, dist_layer2,
                        hidden_keep_prob=keep_prob,
                        add_linear=use_linear))

    # ---- targets ----
    # (n x m x m)
    unlabeled_targets = self.placeholder
    sizes = tf.shape(layer1)
    batch_size, bucket_size = sizes[0], sizes[1]
    # (1 x m)
    ids = tf.expand_dims(tf.range(bucket_size), 0)
    # (1 x m) -> (1 x 1 x m)
    head_ids = tf.expand_dims(ids, -2)
    # (1 x m) -> (1 x m x 1)
    dep_ids = tf.expand_dims(ids, -1)

    if self.linearize:
        # (n x m x m), (1 x m x 1) -> (n x m x m); built but not consumed below
        lin_targets = tf.to_float(tf.less(unlabeled_targets, dep_ids))
        # True where the head index is below the dependent index
        # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
        lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
        # (n x m x m)
        lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
        # enters the edge logits as a constant (no gradient through it)
        logits = logits + tf.stop_gradient(lin_xent)
    if self.distance:
        # (n x m x m) - (1 x m x 1) -> (n x m x m); built but not consumed below
        dist_targets = tf.abs(unlabeled_targets - dep_ids)
        # |head - dep| for every pair, nudged away from exact zero
        # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
        dist_ids = tf.to_float(
            tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1])) + 1e-12
        # (n x m x m)
        dist_kld = -tf.log((dist_ids - dist_lamda)**2 / 2 + 1)
        # enters the edge logits as a constant (no gradient through it)
        logits = logits + tf.stop_gradient(dist_kld)

    if debug:
        outputs['printdata'] = {'logits': logits}

    # ---- probabilities / cross entropy ----
    # (n x m x m)
    probabilities = tf.nn.sigmoid(logits) * tf.to_float(token_weights)
    # (n x m x m), (n x m x m), (n x m x m) -> ()
    loss = tf.losses.sigmoid_cross_entropy(
        unlabeled_targets, logits, weights=token_weights)
    n_tokens = tf.to_float(tf.reduce_sum(token_weights))
    if self.linearize:
        lin_target_xent = lin_xent * unlabeled_targets
        loss = loss - tf.reduce_sum(
            lin_target_xent * tf.to_float(token_weights)) / (n_tokens + 1e-12)
    if self.distance:
        dist_target_kld = dist_kld * unlabeled_targets
        loss = loss - tf.reduce_sum(
            dist_target_kld * tf.to_float(token_weights)) / (n_tokens + 1e-12)

    # ---- predictions / precision-recall counts ----
    # A positive logit means sigmoid > 0.5, i.e. a predicted edge.
    # (n x m x m) -> (n x m x m)
    predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = predictions * unlabeled_targets
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1, 2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1, 2])
    # A sequence counts as correct when every target edge is a true positive.
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(
        nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

    # ---- output dictionary ----
    outputs.update({
        'unlabeled_targets': unlabeled_targets,
        'probabilities': probabilities,
        'unlabeled_loss': loss,
        'loss': loss,
    })
    if debug:
        outputs['temp_targets'] = tf.argmax(
            unlabeled_targets, axis=-1, output_type=tf.int32)
    outputs.update({
        'unlabeled_predictions': predictions,
        'n_unlabeled_true_positives': n_true_positives,
        'n_unlabeled_false_positives': n_false_positives,
        'n_unlabeled_false_negatives': n_false_negatives,
        'n_correct_unlabeled_sequences': n_correct_sequences,
        'predictions': predictions,
        'n_true_positives': n_true_positives,
        'n_false_positives': n_false_positives,
        'n_false_negatives': n_false_negatives,
        'n_correct_sequences': n_correct_sequences,
    })
    return outputs