def prepare_calculation(self):
    """Build the graph that fits `A` with `U_norm * Z_norm * U_norm^T`.

    A fresh `tf.Graph` is stored on `self.graph`.  Inside it:

    * `self.A` is the dense `[n_nodes, n_nodes]` matrix built from
      `self.edge_list`, and `self.O` the one built from `self.const_pairs`.
    * `self.P` is a constant made from `self.get_degree_matrix(O)` and
      `self.L` is `P - O`.
    * `self.U` / `self.Z` are variables initialised by `get_initial_U` /
      `get_initial_Z`.
    * `self.loss` is `tf.nn.l2_loss(A - Y)` with
      `Y = U_norm @ Z_norm @ U_norm^T`, minimised by Adam (`self.opt`).

    Finally `self.setup_session()` is called.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        K = self.K
        n_nodes = self.n_nodes
        dense_shape = [n_nodes, n_nodes]

        edge_list, weights = self.seperate_nodeid_and_weight(
            self.edge_list)
        const_pairs, const_weights = self.seperate_nodeid_and_weight(
            self.const_pairs)

        # NOTE: a stray `pdb.set_trace()` used to sit here and blocked every
        # non-interactive run; it has been removed.
        self.A = A = tf.sparse_to_dense(output_shape=dense_shape,
                                        sparse_indices=edge_list,
                                        sparse_values=weights)
        self.O = O = tf.sparse_to_dense(output_shape=dense_shape,
                                        sparse_indices=const_pairs,
                                        sparse_values=const_weights)
        self.P = P = tf.constant(self.get_degree_matrix(O))
        self.L = P - O

        degrees = self.get_degree_vector()
        self.U = U = tf.Variable(self.get_initial_U(degrees, K), name="U")
        self.Z = Z = tf.Variable(self.get_initial_Z(degrees, K), name="Z")

        U_norm = self.normalize_U(U)
        Z_norm = self.get_positive_variable(Z)
        Y = tf.matmul(U_norm, tf.matmul(Z_norm, U_norm, transpose_b=True))

        self.loss = loss = tf.nn.l2_loss(A - Y)
        # The optimizer lives in `tf.train`; `tf.AdamOptimizer` does not exist.
        adam = tf.train.AdamOptimizer(self.lr)
        self.opt = adam.minimize(loss)
        self.setup_session()
def prepare_calculation(self):
    """Assemble the training graph, optimizer, session and init op.

    Everything is created inside a new `tf.Graph` kept on `self.graph`:

    * `self.A` / `self.O`: dense `[n_nodes, n_nodes]` matrices built from
      `self.edge_list` and `self.const_pairs`.
    * `self.D = get_degree_matrix(O)` and `self.L = D - O`.
    * `self.H_var` / `self.W_var`: `[n_nodes, K]` variables drawn from a
      uniform initializer; `W_var` is trainable only when `self.synmetric`
      is false.  `self.W` is bound to the same tensor as `self.H`.
    * `self.cost = self.loss + self.mlambda * self.sup_term`.
    * `self.opt`: the minimize op, grouped with a clip-to-non-negative
      assignment on `H_var` when `self.positivate == "clip"`.
    * `self.sess` and `self.init_op`.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        K = self.K
        n_nodes = self.n_nodes
        square = [n_nodes, n_nodes]
        factor_shape = [n_nodes, K]

        edge_list, weights = self.seperate_nodeid_and_weight(
            self.edge_list)
        const_pairs, const_weights = self.seperate_nodeid_and_weight(
            self.const_pairs)
        mlambda = self.mlambda

        A = tf.sparse_to_dense(output_shape=square,
                               sparse_indices=edge_list,
                               sparse_values=weights)
        self.A = A
        O = tf.sparse_to_dense(output_shape=square,
                               sparse_indices=const_pairs,
                               sparse_values=const_weights)
        self.O = O
        D = self.get_degree_matrix(O)
        self.D = D
        L = D - O
        self.L = L

        # Upper bound of the uniform initializer, derived from the edge weights.
        scaler = 2 * np.sqrt(weights.sum() / (n_nodes * n_nodes * K))
        initializer = tf.random_uniform_initializer(maxval=scaler)

        H_var = tf.get_variable("H_var", shape=factor_shape,
                                initializer=initializer)
        self.H_var = H_var
        self.W_var = tf.get_variable("W_var", shape=factor_shape,
                                     initializer=initializer,
                                     trainable=(not self.synmetric))

        # Make H positive.
        H = self.get_positive_variable(H_var)
        self.H = H
        self.W = H
        H_norm = self.normalize_H(H, n_nodes)

        loss = self.loss_LSE(A, H)
        self.loss = loss
        sup_term = self.supervisor_term(H_norm, L)
        self.sup_term = sup_term
        cost = loss + mlambda * sup_term
        self.cost = cost

        self.define_tfsummary()

        if self.optimizer == "adam":
            optimizer = tf.train.AdamOptimizer(self.lr, epsilon=0.1)
        else:
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        train_step = optimizer.minimize(cost)

        if self.positivate == "clip":
            # Clip only after the gradient step has been applied.
            with tf.control_dependencies([train_step]):
                non_negative = tf.maximum(H_var, 0)
                clip_H = H_var.assign(non_negative)
            self.opt = tf.group(train_step, clip_H)
        else:
            self.opt = train_step

        config = tf.ConfigProto(
            inter_op_parallelism_threads=self.threads,
            intra_op_parallelism_threads=self.threads)
        self.sess = tf.Session(config=config)
        self.init_op = tf.global_variables_initializer()
def testShapeInferenceKnownShape(self):
    """Check the static shape `sparse_to_dense` infers for two shape inputs."""
    with self.test_session(use_gpu=False):
        indices = tf.placeholder(tf.int64)

        # A Python list shape yields a fully known static shape.
        static_shape = [4, 5, 6]
        dense = tf.sparse_to_dense(indices, static_shape, 1, 0)
        self.assertEqual(dense.get_shape(), [4, 5, 6])

        # A length-3 placeholder shape yields rank 3 with unknown dimensions.
        dynamic_shape = tf.placeholder(tf.int64, shape=(3,))
        dense = tf.sparse_to_dense(indices, dynamic_shape, 1, 0)
        self.assertEqual(dense.get_shape().as_list(), [None, None, None])
def map_box_encodings(i):
    """Produces box K-hot and score encodings for each class index.

    Reads `unique_indices`, `num_boxes`, `classes`, `confidences` and
    `num_classes` from the enclosing scope.  Rows whose `unique_indices`
    entry equals `i` are selected; their class ids are scattered into two
    `[num_classes]` vectors, one holding 1s and one holding the matching
    confidences.
    """
    wanted = i * tf.ones(num_boxes, dtype=tf.int32)
    selected = tf.reshape(tf.equal(unique_indices, wanted), [-1])

    selected_classes = tf.boolean_mask(classes, selected)
    selected_confidences = tf.boolean_mask(confidences, selected)

    dense_shape = [num_classes]
    k_hot = tf.sparse_to_dense(
        selected_classes, dense_shape, 1, validate_indices=False)
    scores = tf.sparse_to_dense(
        selected_classes, dense_shape, selected_confidences,
        validate_indices=False)
    return k_hot, scores
def build_generator(self):
    """Build the scoring graph and return it together with its placeholders.

    The loop is unrolled `MAX_QUERY_WORDS` times.  Each step runs the three
    LSTMs (`lstm_lang`, `lstm_context`, `lstm_local`), produces a word
    prediction, reads off the prediction at the query word of that step,
    masks it with `query_mask[:, j]`, applies a softmax to the reshaped
    `[1, batch_size]` row and adds the result to `score`.  The arg-max word
    is embedded and fed back as the next language-LSTM input.

    Returns:
      (score, image, local_image, query, query_mask, bbox): the accumulated
      score tensor followed by the five input placeholders.
    """
    # placeholder is for feeding data
    image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])  # (batch_size, dim_image)
    local_image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
    query = tf.placeholder(tf.int32, [self.batch_size, MAX_QUERY_WORDS])
    query_mask = tf.placeholder(tf.float32, [self.batch_size, MAX_QUERY_WORDS])
    bbox = tf.placeholder(tf.float32, [self.batch_size, self.dim_coordinates])

    # [image] embed image feature to dim_hidden
    image_emb = tf.nn.bias_add(tf.matmul(image, self.embed_image_W), self.embed_image_b)  # (batch_size, dim_hidden)
    local_image_emb = tf.nn.bias_add(tf.matmul(local_image, self.embed_local_W), self.embed_local_b)  # (batch_size, dim_hidden)

    score = tf.zeros([self.batch_size], tf.float32)

    # Initial LSTM states and the initial (all-zero) word embedding.
    state_lang = tf.zeros([self.batch_size, self.lstm_lang.state_size])
    state_context = tf.zeros([self.batch_size, self.lstm_context.state_size])
    state_local = tf.zeros([self.batch_size, self.lstm_local.state_size])
    query_emb = tf.zeros([self.batch_size, self.dim_hidden])
    for j in range(MAX_QUERY_WORDS):

        # language lstm
        with tf.variable_scope("lstm_lang"):
            output_lang, state_lang = self.lstm_lang(query_emb, state_lang)
        # The first dim_hidden columns of the state are used as the feature.
        lang = tf.slice(state_lang, [0,0], [self.batch_size, self.dim_hidden])

        # context lstm
        with tf.variable_scope("lstm_context"):
            output_context, state_context = self.lstm_context(tf.concat(1,[image_emb, lang]), state_context)
        context = tf.slice(state_context, [0,0], [self.batch_size, self.dim_hidden])

        # local lstm
        with tf.variable_scope("lstm_local"):
            output_local, state_local = self.lstm_local(tf.concat(1,[local_image_emb, lang, bbox]), state_local)
        local = tf.slice(state_local, [0,0], [self.batch_size, self.dim_hidden])

        context_emb = tf.nn.xw_plus_b(context, self.W_context, self.B_context)
        local_emb = tf.nn.xw_plus_b(local, self.W_local, self.B_local)
        word_pred = tf.add(context_emb, local_emb)

        max_prob_index = tf.argmax(word_pred, 1)  # b

        # One-hot encode the j-th query word of every example via
        # (row, word-id) coordinates.
        labels = tf.expand_dims(query[:,j], 1)
        indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
        concated = tf.concat(1, [indices, labels])
        with tf.device('/cpu:0'):
            onehot_labels = tf.sparse_to_dense(concated, tf.pack([self.batch_size, self.dict_words]), 1.0, 0.0)
        # Pick the predicted value at the query word, then mask it.
        current_score = tf.mul(onehot_labels, word_pred)
        current_score = tf.reduce_sum(current_score, 1)
        current_score = tf.mul(current_score, query_mask[:,j])
        # Softmax runs over the reshaped [1, batch_size] row.
        # NOTE(review): this normalises across the batch dimension - confirm
        # that the batch holds the candidates being compared.
        current_score = tf.reshape(current_score, [1,self.batch_size])
        current_score = tf.nn.softmax(current_score)
        score = tf.add(score, current_score)

        with tf.device("/cpu:0"):
            # From here on, variables in the current scope are reused, so
            # later unrolled steps share the weights created in this step.
            tf.get_variable_scope().reuse_variables()
            query_emb = tf.nn.embedding_lookup(self.query_emb_W, max_prob_index)

    return score, image, local_image, query, query_mask, bbox
def build_model(self):
    """Build the highlight-classification loss and its input placeholders.

    Frames are embedded with `encode_image_W` / `encode_image_b`, run through
    `self.lstm_HL_net`, and each time step is classified into two classes
    with `embed_HL_W` / `embed_HL_b`.  The masked softmax cross entropy is
    summed over every example in the batch and divided by the sum of
    `HLness_mask`.

    Returns:
      (loss, video, video_mask, HLness, HLness_mask): the scalar loss tensor
      followed by the four placeholders.
    """
    video = tf.placeholder(
        tf.float32, [self.batch_size, self.n_lstm_steps, self.dim_image])
    video_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_lstm_steps])
    HLness = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
    HLness_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_lstm_steps])

    video_flat = tf.reshape(video, [-1, self.dim_image])
    image_emb = tf.nn.xw_plus_b(
        video_flat, self.encode_image_W,
        self.encode_image_b)  # (batch_size*n_lstm_steps, dim_hidden)
    image_emb = tf.reshape(
        image_emb, [self.batch_size, self.n_lstm_steps, self.dim_hidden])
    image_emb = tf.transpose(image_emb, [1, 0, 2])  # n x b x h

    loss_HL = 0.0
    _X = tf.reshape(image_emb, [-1, self.dim_hidden])  # (n x b) x h
    _X = tf.split(0, self.n_lstm_steps, _X)  # n x (b x h)
    output2, _ = rnn.rnn(self.lstm_HL_net, _X, dtype=tf.float32)  # n x (b x h)
    output2 = tf.transpose(tf.pack(output2), [1, 0, 2])  # b x n x h

    indices = tf.expand_dims(tf.range(0, self.n_lstm_steps, 1), 1)  # n x 1
    # Iterate over every example in the batch.  The loop bound used to be a
    # hard-coded 10, which only matched batch_size == 10 even though the
    # normaliser below sums the mask of the whole batch.
    for ii in range(self.batch_size):
        labels = tf.expand_dims(HLness[ii, :], 1)  # n x 1
        concated = tf.concat(1, [indices, labels])  # n x 2
        onehot_labels = tf.sparse_to_dense(
            concated, tf.pack([self.n_lstm_steps, 2]), 1.0, 0.0)  # n x 2
        logit_words = tf.nn.xw_plus_b(
            output2[ii, :, :], self.embed_HL_W, self.embed_HL_b)  # n x 2
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logit_words, onehot_labels)  # n x 1
        cross_entropy = tf.mul(cross_entropy, HLness_mask[ii, :])  # n x 1
        loss_HL += tf.reduce_sum(cross_entropy)  # 1

    loss_HL = loss_HL / tf.reduce_sum(HLness_mask)
    loss = loss_HL
    return loss, video, video_mask, HLness, HLness_mask
def _input_fn():
    """Read one TFRecord example at a time and return shuffled batches.

    Reads `filenames`, `num_epochs`, `sentence_length`, `vocab_size` and
    `batch_size` from the enclosing scope.

    Each record holds a variable-length `words` string feature and a single
    int64 `subreddit`.  Words are densified to `sentence_length` entries
    (missing positions become 'UNK') and hashed into `vocab_size` buckets.

    Returns:
      (sentences, subreddits): batched tensors from `tf.train.shuffle_batch`.
    """
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            filenames, num_epochs=num_epochs)

        reader = tf.TFRecordReader()
        # `read` returns a single serialized record, which is what
        # `parse_single_example` expects.  The earlier `read_up_to(queue)`
        # call lacked its required `num_records` argument, and the parse call
        # referenced an undefined name (`serialized_examples`).
        _, serialized_example = reader.read(filename_queue)

        features = tf.parse_single_example(
            serialized_example,
            {
                'words': tf.VarLenFeature(tf.string),
                'subreddit': tf.FixedLenFeature([1], tf.int64)
            }
        )

        padded_words = tf.sparse_to_dense(
            features['words'].indices,
            [sentence_length],
            features['words'].values,
            default_value='UNK'
        )

        word_indices = tf.string_to_hash_bucket_fast(
            padded_words,
            vocab_size)

        sentences, subreddits = tf.train.shuffle_batch(
            [word_indices, features['subreddit']],
            batch_size,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000,
            enqueue_many=False
        )
    return sentences, subreddits
def build_input(data, batch_size, dataset, train):
    """Build batched CIFAR images and one-hot labels.

    Args:
      data: Pair `(images, labels)`; both are sliced along their first axis.
      batch_size: Input batch size.  Trailing samples that do not fill a
        whole batch are dropped.
      dataset: "cifar10" selects 10 classes; any other value selects 100.
      train: True if we are training and false if we are testing.

    Returns:
      images: Batches of images of size [batch_size, image_size, image_size, 3].
      labels: Batches of labels of size [batch_size, num_classes].
    """
    image_size = 32
    depth = 3
    num_classes = 10 if dataset == "cifar10" else 100

    raw_images, raw_labels = data
    total = raw_images.shape[0]
    num_samples = total - total % batch_size

    def map_train(image, label):
        # Pad by 4 pixels, take a random crop, flip at random, standardize.
        padded = tf.image.resize_image_with_crop_or_pad(
            image, image_size + 4, image_size + 4)
        cropped = tf.random_crop(padded, [image_size, image_size, depth])
        flipped = tf.image.random_flip_left_right(cropped)
        return (tf.image.per_image_standardization(flipped), label)

    def map_test(image, label):
        # Centre crop/pad to the target size, then standardize.
        resized = tf.image.resize_image_with_crop_or_pad(
            image, image_size, image_size)
        return (tf.image.per_image_standardization(resized), label)

    pipeline = tf.contrib.data.Dataset.from_tensor_slices(
        (raw_images[:num_samples], raw_labels[:num_samples]))
    pipeline = pipeline.map(map_train if train else map_test)
    pipeline = pipeline.batch(batch_size)
    pipeline = pipeline.repeat()
    if train:
        pipeline = pipeline.shuffle(buffer_size=16 * batch_size)

    images, labels = pipeline.make_one_shot_iterator().get_next()
    images = tf.reshape(images, [batch_size, image_size, image_size, depth])

    # Turn integer labels into one-hot rows via (row, class) coordinates.
    labels = tf.reshape(labels, [batch_size, 1])
    row_ids = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
    labels = tf.sparse_to_dense(
        tf.concat([row_ids, labels], 1),
        [batch_size, num_classes], 1.0, 0.0)

    image_shape = images.get_shape()
    label_shape = labels.get_shape()
    assert len(image_shape) == 4
    assert image_shape[0] == batch_size
    assert image_shape[-1] == 3
    assert len(label_shape) == 2
    assert label_shape[0] == batch_size
    assert label_shape[1] == num_classes

    if not train:
        tf.summary.image("images", images)
    return images, labels
def encode_one_hot(label_batch, num_labels):
    """Return a `[len(label_batch), num_labels]` one-hot float matrix.

    Each row has 1.0 at the column given by the matching entry of
    `label_batch` and 0.0 elsewhere.  The row count is taken from the
    runtime shape of `label_batch`.
    """
    class_ids = tf.reshape(label_batch, [-1, 1])
    num_rows = tf.shape(label_batch)[0]
    row_ids = tf.reshape(tf.range(0, num_rows, 1), [-1, 1])

    # (row, class) coordinates of every 1.0 entry.
    coordinates = tf.concat(1, [row_ids, class_ids])
    dense_shape = tf.pack([num_rows, num_labels])
    return tf.sparse_to_dense(
        coordinates, dense_shape, sparse_values=1.0, default_value=0.0)
def pad_tensor_to_batch_size(tensor, batch_size):
    """Pads a Tensor along the batch dimension to the desired batch size.

    Args:
      tensor: Tensor with at least one dimension; dimension 0 is the batch.
      batch_size: Target size of dimension 0; must be at least 2.

    Returns:
      The padded tensor, with its static shape's first entry set to
      `batch_size`.

    Raises:
      ValueError: if `batch_size < 2` or `tensor` has no dimensions.
    """
    if batch_size < 2:
        raise ValueError("Cannot pad along batch dimension with batch_size < 2.")

    rank = len(tensor.shape)
    if rank < 1:
        raise ValueError("Cannot pad a 0-dimensional Tensor")

    missing_rows = batch_size - tf.shape(tensor)[0]

    # `pad_spec` has shape [rank, 2] and is zero everywhere except at [0][1],
    # which holds the number of rows appended after the existing content of
    # the batch dimension.
    pad_spec = tf.sparse_to_dense(
        sparse_indices=[[0, 1]],
        output_shape=[rank, 2],
        sparse_values=missing_rows)
    padded = tf.pad(tensor, pad_spec, name=tensor.op.name + "/pad")

    # Record the now-known batch dimension in the static shape.
    static_shape = tensor.shape.as_list()
    static_shape[0] = batch_size
    padded.set_shape(static_shape)
    return padded
def loss(logits, labels):
    """Return mean softmax cross entropy plus every other collected loss.

    Args:
      logits: Logits from inference().
      labels: Labels from distorted_inputs or inputs(). 1-D tensor
              of shape [batch_size]

    Returns:
      Loss tensor of type float: the sum of all entries in the 'losses'
      collection after the mean cross entropy has been added to it.
    """
    # Expand the integer labels in [0, NUM_CLASSES) to dense one-hot rows:
    # one row per example, 1.0 at the label's column and 0.0 elsewhere.
    num_examples = tf.size(labels)
    class_ids = tf.expand_dims(labels, 1)
    row_ids = tf.expand_dims(tf.range(0, num_examples), 1)
    coordinates = tf.concat(1, [row_ids, class_ids])
    onehot_labels = tf.sparse_to_dense(
        coordinates, tf.pack([num_examples, NUM_CLASSES]), 1.0, 0.0)

    per_example = tf.nn.softmax_cross_entropy_with_logits(
        logits, onehot_labels, name='xentropy')

    # Average over the batch and register the result with the other losses.
    mean_xentropy = tf.reduce_mean(per_example, name='xentropy_mean')
    tf.add_to_collection('losses', mean_xentropy)

    # Total = cross entropy + whatever else (e.g. weight decay) was collected.
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def read_data(filename_queue):
    """Read one TFRecord example and return its image and one-hot label.

    :param filename_queue: queue of TFRecord file names to read from
    :return: a `CIFAR10Record` whose `image` is a float32 tensor of shape
        [input_image_size, input_image_size, input_image_channels] and whose
        `label` is a dense vector of length `num_labels` holding 1.0 at the
        label index and 0.0 elsewhere
    """
    class CIFAR10Record(object):
        pass

    record = CIFAR10Record()

    _, serialized_example = tf.TFRecordReader().read(filename_queue)
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example,
                                     features=feature_spec)

    # Decode the raw bytes, pin the flat length, then restore the 3-D layout.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape(
        [input_image_size * input_image_size * input_image_channels])
    pixels = tf.cast(pixels, tf.float32)
    record.image = tf.reshape(
        pixels, [input_image_size, input_image_size, input_image_channels])

    class_id = tf.cast(parsed['label'], tf.int32)
    record.label = tf.sparse_to_dense(class_id, [num_labels], 1.0, 0.0)
    return record
def loss_test(logits, labels, batch_size=None):
    """Return the main and auxiliary cross-entropy losses without collecting.

    Args:
      logits: Sequence whose first two entries are the main and auxiliary
        logits tensors.
      labels: Integer class labels, reshaped here to [batch_size, 1].
      batch_size: Number of examples.  It is used directly in reshape and
        range calls, so callers must pass a concrete value.

    Returns:
      (loss, aux_loss): label-smoothed (0.1) cross-entropy losses for
      `logits[0]` (weight 1.0) and `logits[1]` (weight 0.4).
    """
    # Reshape the labels into a dense Tensor of
    # shape [FLAGS.batch_size, num_classes].
    sparse_labels = tf.reshape(labels, [batch_size, 1])
    indices = tf.reshape(tf.range(batch_size), [batch_size, 1])
    sparse_labels = tf.cast(sparse_labels, tf.int32)
    concated = tf.concat(1, [indices, sparse_labels])
    num_classes = logits[0].get_shape()[-1].value
    dense_labels = tf.sparse_to_dense(concated,
                                      [batch_size, num_classes],
                                      1.0, 0.0)

    # Debug output.  Parenthesised single-argument print behaves the same on
    # Python 2 and Python 3; the bare statement form was Python-2-only.
    print("-" * 10)
    print(type(logits))
    print(len(logits))
    print(logits[0].get_shape())
    print(logits[1].get_shape())
    print("-" * 10)

    # Cross entropy loss for the main softmax prediction.
    loss = slim.losses.cross_entropy_loss_without_collection(
        logits[0], dense_labels, label_smoothing=0.1, weight=1.0)

    # Cross entropy loss for the auxiliary softmax head.
    aux_loss = slim.losses.cross_entropy_loss_without_collection(
        logits[1], dense_labels, label_smoothing=0.1, weight=0.4,
        scope='aux_loss')

    return loss, aux_loss
def __init__(self, is_training, config):
    """Build the stacked-LSTM model graph.

    Args:
      is_training: When true (and `config.keep_prob < 1`) every LSTM layer is
        wrapped in dropout.
      config: Object providing `num_skills`, `num_steps` and `keep_prob`.

    Creates the placeholders `_input_data` ([batch_size, num_steps] int32),
    `_target_id` and `_target_correctness`, and sets `_pred` /
    `_pred_values` (sigmoid of the selected logits) and `_cost` (summed
    sigmoid cross entropy).
    """
    self._batch_size = batch_size = FLAGS.batch_size
    self.num_skills = num_skills = config.num_skills
    self.hidden_size = size = FLAGS.hidden_size
    self.num_steps = num_steps = config.num_steps
    input_size = num_skills * 2

    inputs = self._input_data = tf.placeholder(
        tf.int32, [batch_size, num_steps])
    self._target_id = target_id = tf.placeholder(tf.int32, [None])
    self._target_correctness = target_correctness = tf.placeholder(
        tf.float32, [None])

    final_hidden_size = size
    hidden_layers = []
    for i in range(FLAGS.hidden_layer_num):
        # Layer i gets size/(i+1) units.  Floor division keeps the unit count
        # an integer on Python 3 as well (true division produced a float).
        final_hidden_size = size // (i + 1)
        hidden1 = tf.nn.rnn_cell.LSTMCell(final_hidden_size,
                                          state_is_tuple=True)
        # NOTE(review): the condition reads config.keep_prob but the wrapper
        # uses FLAGS.keep_prob - confirm which one is intended.
        if is_training and config.keep_prob < 1:
            hidden1 = tf.nn.rnn_cell.DropoutWrapper(
                hidden1, output_keep_prob=FLAGS.keep_prob)
        hidden_layers.append(hidden1)

    cell = tf.nn.rnn_cell.MultiRNNCell(hidden_layers, state_is_tuple=True)

    input_data = tf.reshape(self._input_data, [-1])
    # One-hot encode every input id via (row, id) coordinates.
    with tf.device("/cpu:0"):
        labels = tf.expand_dims(input_data, 1)
        indices = tf.expand_dims(tf.range(0, batch_size * num_steps, 1), 1)
        concated = tf.concat(1, [indices, labels])
        inputs = tf.sparse_to_dense(
            concated, tf.pack([batch_size * num_steps, input_size]),
            1.0, 0.0)
        inputs.set_shape([batch_size * num_steps, input_size])

    # [batch_size, num_steps, input_size]
    inputs = tf.reshape(inputs, [-1, num_steps, input_size])
    x = tf.transpose(inputs, [1, 0, 2])
    # Reshape to (n_steps*batch_size, n_input)
    x = tf.reshape(x, [-1, input_size])
    # Split to get a list of 'n_steps'
    # tensors of shape (doc_num, n_input)
    x = tf.split(0, num_steps, x)
    outputs, state = tf.nn.rnn(cell, x, dtype=tf.float32)
    output = tf.reshape(tf.concat(1, outputs), [-1, final_hidden_size])

    # calculate the logits from last hidden layer to output layer
    sigmoid_w = tf.get_variable("sigmoid_w", [final_hidden_size, num_skills])
    sigmoid_b = tf.get_variable("sigmoid_b", [num_skills])
    logits = tf.matmul(output, sigmoid_w) + sigmoid_b

    # from output nodes to pick up the right one we want
    logits = tf.reshape(logits, [-1])
    # Use the placeholder created above directly; `self.target_id` would rely
    # on an accessor that this method does not define.
    selected_logits = tf.gather(logits, target_id)

    # make prediction
    self._pred = self._pred_values = tf.sigmoid(selected_logits)

    # loss function: summed (not averaged) over the selected logits
    loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(selected_logits,
                                                target_correctness))
    self._cost = loss
def softmax_loss_layer(name, score_bottom, label_bottom): """ Calculates cumulative Softmax Cross Entropy Loss along the last dimension *This function does not divide the loss by batch size* Once tensorflow has SparseCrossEntropy function, this one will be replaced """ # Check shape score_shape = score_bottom.get_shape().as_list() label_shape = label_bottom.get_shape().as_list() assert len(score_shape) == len(label_shape) + 1 assert score_shape[:-1] == label_shape # Compute the outer dimensions dimensions in label inner_dim = score_shape[-1] outer_dim = 1 for d in label_shape: outer_dim *= d # flatten score and label flat_score = tf.reshape(score_bottom, [outer_dim, inner_dim]) flat_label = tf.reshape(label_bottom, [outer_dim, 1]) # Reshape the labels into a dense Tensor of # shape [batch_size, NUM_CLASSES]. sparse_labels = tf.reshape(labels, [FLAGS.batch_size, 1]) indices = tf.reshape(tf.range(FLAGS.batch_size), [FLAGS.batch_size, 1]) concated = tf.concat(1, [indices, sparse_labels]) dense_labels = tf.sparse_to_dense(concated, [FLAGS.batch_size, NUM_CLASSES], 1.0, 0.0)
def default_exchange_proposed_fn_(num_replica, seed=None):
    """Default function for `exchange_proposed_fn` of `kernel`.

    Reads `probs` from the enclosing scope.

    Args:
      num_replica: Number of replicas; converted with `tf.to_int32`.
      seed: Optional seed; a fresh seed is derived from it for each of the
        two random draws.

    Returns:
      (exchange_proposed, exchange_proposed_n): the proposed index pairs,
      reshaped to `(num_replica // 2, 2)`, and the number of proposed pairs.
      Both are replaced by zeros when the first uniform draw is not below
      `probs`.
    """
    num_replica = tf.to_int32(num_replica)

    # First draw: decides whether any exchange is proposed at all.
    seed = distributions_util.gen_new_seed(seed, 'default_exchange_proposed_fn')
    random_uniform = tf.random_uniform([], seed=seed)
    accept_proposed_exchange = random_uniform < probs

    # Second draw: chooses between the two pairing layouts below.
    seed = distributions_util.gen_new_seed(seed, 'default_exchange_proposed_fn')
    zero_start = tf.random_uniform([], seed=seed) > 0.5
    # NOTE(review): `num_replica` is the result of tf.to_int32 at this point,
    # so this `if` is evaluated in Python at graph-construction time - confirm
    # that the comparison selects the intended branch for tensor inputs.
    if num_replica % 2 == 0:
        exchange_proposed = tf.where(
            zero_start, tf.range(num_replica),
            tf.sparse_to_dense(tf.range(num_replica - 2), (num_replica,),
                               tf.range(1, num_replica - 1)))
        exchange_proposed_n = tf.where(zero_start, num_replica // 2,
                                       num_replica // 2 - 1)
    else:
        exchange_proposed = tf.where(
            zero_start, tf.range(num_replica - 1), tf.range(1, num_replica))
        exchange_proposed_n = num_replica // 2

    # Pair up consecutive entries, then blank everything out if the proposal
    # was not accepted.
    exchange_proposed = tf.reshape(exchange_proposed, (num_replica // 2, 2))
    exchange_proposed = tf.where(accept_proposed_exchange, exchange_proposed,
                                 tf.zeros_like(exchange_proposed))
    exchange_proposed_n = tf.where(accept_proposed_exchange,
                                   exchange_proposed_n,
                                   tf.zeros_like(exchange_proposed_n))
    return exchange_proposed, exchange_proposed_n
def body(i, next_replica_idx):
    """`tf.while_loop` body.

    Handles the `i`-th proposed pair from the enclosing scope's
    `exchange_proposed`: when a log-uniform draw is below the computed ratio,
    the two entries of `next_replica_idx` at the pair's positions are swapped
    by adding their signed differences.  `self._seed_stream` is advanced.

    Returns:
      `[i + 1, next_replica_idx]`.
    """
    first = exchange_proposed[i, 0]
    second = exchange_proposed[i, 1]
    idx_at_first = next_replica_idx[first]
    idx_at_second = next_replica_idx[second]

    ratio_gap = (sampled_replica_ratios[idx_at_first]
                 - sampled_replica_ratios[idx_at_second])
    temperature_gap = (self.inverse_temperatures[second]
                       - self.inverse_temperatures[first])
    ratio = ratio_gap * temperature_gap

    self._seed_stream = distributions_util.gen_new_seed(
        self._seed_stream, salt='replica_exchange_one_step')
    log_uniform = tf.log(tf.random_uniform(
        shape=tf.shape(ratio),
        dtype=ratio.dtype.base_dtype,
        seed=self._seed_stream))
    exchange = log_uniform < ratio

    # Adding this vector swaps the two entries and leaves the rest untouched.
    exchange_op = tf.sparse_to_dense(
        [first, second],
        [self.num_replica],
        [idx_at_second - idx_at_first, idx_at_first - idx_at_second])

    next_replica_idx = tf.cond(exchange,
                               lambda: next_replica_idx + exchange_op,
                               lambda: next_replica_idx)
    return [i + 1, next_replica_idx]
def loss(logits, labels, batch_size=None):
    """Register the main cross-entropy loss with slim; returns nothing.

    The loss is not returned.  It is registered through
    `slim.losses.cross_entropy_loss`, to be gathered by the caller.

    Args:
      logits: List of logits from inference(). Each entry is a 2-D float
        Tensor; only the first entry is used here.
      labels: Labels from distorted_inputs or inputs(). 1-D tensor of shape
        [batch_size]
      batch_size: integer; falls back to FLAGS.batch_size when falsy.
    """
    batch_size = batch_size or FLAGS.batch_size

    # Densify the labels to one-hot rows of shape [batch_size, num_classes].
    class_ids = tf.reshape(labels, [batch_size, 1])
    row_ids = tf.reshape(tf.range(batch_size), [batch_size, 1])
    coordinates = tf.concat(1, [row_ids, class_ids])
    num_classes = logits[0].get_shape()[-1].value
    dense_labels = tf.sparse_to_dense(
        coordinates, [batch_size, num_classes], 1.0, 0.0)

    # Cross entropy loss for the main softmax prediction.
    slim.losses.cross_entropy_loss(
        logits[0], dense_labels, label_smoothing=0.1, weight=1.0)
def disable_some_fgs():
    """Negate the labels of a random subset of foreground proposals.

    Reads `fg_inds`, `max_fg`, `proposals_label`, `proposals_label_shape` and
    `self._seed` from the enclosing scope.  The first
    `len(fg_inds) - max_fg` entries of the shuffled foreground indices are
    marked as disabled, and the labels at those positions are returned
    negated.
    """
    # Shuffle along dimension 0; the leading `surplus` indices get disabled.
    shuffled = tf.random_shuffle(fg_inds, seed=self._seed)
    surplus = (tf.shape(fg_inds)[0] - max_fg)

    # This branch should only run when there are more foregrounds than
    # `max_fg`; the assertion catches any other situation.
    surplus_is_positive = tf.assert_positive(
        surplus,
        message="disable_place in disable_some_fgs is negative."
    )
    with tf.control_dependencies([surplus_is_positive]):
        to_disable = shuffled[:surplus]

    disabled_mask = tf.sparse_to_dense(
        sparse_indices=to_disable,
        sparse_values=True,
        default_value=False,
        output_shape=tf.cast(proposals_label_shape, tf.int64),
        # The indices were shuffled, so they may not be ordered.
        validate_indices=False
    )
    # Disabled entries become -label (kept distinguishable for debugging).
    negated = tf.negative(proposals_label)
    return tf.where(condition=disabled_mask, x=negated, y=proposals_label)
def one_hot_mask(labels, num_classes, scope=None):
    """Compute 1-hot encodings for masks.

    Given a label image, this computes the one hot encoding at each pixel.

    Args:
      labels: label tensor whose `_shape` unpacks into three entries
        (height, width, depth) with depth == 1.
      num_classes: number of classes
      scope: optional scope name

    Returns:
      Tensor of shape (height, width, num_classes) with a 1-hot encoding.
    """
    with tf.name_scope(scope, "OneHotMask", [labels]):
        height, width, depth = _shape(labels)
        assert depth == 1

        # One (pixel, class) coordinate per pixel of the flattened image.
        flat_labels = tf.to_int32(tf.reshape(labels, [-1, 1]))
        num_pixels, _ = _shape(flat_labels)
        pixel_ids = tf.reshape(tf.range(0, num_pixels, 1), [-1, 1])
        coordinates = tf.concat_v2([pixel_ids, flat_labels], 1)

        flat_one_hot = tf.sparse_to_dense(
            coordinates, [num_pixels, num_classes], 1.0, 0.0)
        return tf.reshape(flat_one_hot, [height, width, num_classes])
def ce(model, config, scope, connect, threshold = 1e-5):
    """Attach a softmax cross-entropy layer named `scope` to `model`.

    The layer takes its lengths and its input from the `connect` layer's
    `*_out*` entries.  The input is passed through softmax and clipped to
    `[threshold, 1 - threshold]`.  Labels arrive as sparse
    (indices, values) placeholders, are densified with -1 as the fill value
    and then one-hot encoded.  `config` is not used by this function.

    Returns:
      The same `model` dict, with the `'<scope>_*'` entries added.
    """
    with tf.variable_scope(scope), tf.name_scope(scope):
        with tf.variable_scope('inputs'), tf.name_scope('inputs'):
            len0 = model['%s_out0length' %connect]
            len1 = model['%s_out1length' %connect]
            len2 = model['%s_out2length' %connect]
            max_len2 = model['%s_maxout2length' %connect]
            model['%s_in0length' %scope] = len0
            model['%s_in1length' %scope] = len1
            model['%s_in2length' %scope] = len2
            model['%s_maxin2length' %scope] = max_len2

            probabilities = tf.clip_by_value(
                tf.nn.softmax(model['%s_outputs' %connect]),
                threshold, 1. - threshold, name = '%s_inputs' %scope)
            model['%s_inputs' %scope] = probabilities

            model['%s_out0length' %scope] = len0
            model['%s_out1length' %scope] = len1
            model['%s_out2length' %scope] = tf.placeholder(
                tf.int32, [len0], '%s_out2length' %scope)
            model['%s_maxout2length' %scope] = max_len2

        with tf.variable_scope('labels'), tf.name_scope('labels'):
            model['%s_labels_len' %scope] = tf.placeholder(
                tf.int32, [len0], '%s_labels_len' %scope)
            label_indices = tf.placeholder(
                tf.int64, [None, 2], '%s_labels_ind' %scope)
            model['%s_labels_ind' %scope] = label_indices
            label_values = tf.placeholder(
                tf.int32, [None], '%s_labels_val' %scope)
            model['%s_labels_val' %scope] = label_values

            # Positions without a label get -1.
            collapsed = tf.sparse_to_dense(
                label_indices, [max_len2, len0], label_values, -1,
                name = '%s_labels_collapsed' %scope)
            model['%s_labels_collapsed' %scope] = collapsed
            one_hot_labels = tf.one_hot(
                collapsed, len1, name = '%s_labels' %scope)
            model['%s_labels' %scope] = one_hot_labels

        with tf.variable_scope('loss'), tf.name_scope('loss'):
            model['%s_loss' %scope] = tf.reduce_sum(
                -tf.multiply(one_hot_labels, tf.log(probabilities)),
                name = '%s_loss' %scope)

        with tf.variable_scope('outputs'), tf.name_scope('outputs'):
            model['%s_output' %scope] = probabilities
    return model
def f(X):
    """
    Select boxes by score threshold followed by NMS.

    prob: n probabilities
    box: nx4 boxes

    Returns: n boolean, the selection
    """
    prob, box = X
    mask_shape = tf.shape(prob)

    # Keep only candidates above the score threshold.
    keep_ids = tf.reshape(
        tf.where(prob > cfg.TEST.RESULT_SCORE_THRESH), [-1])
    kept_prob = tf.gather(prob, keep_ids)
    kept_box = tf.gather(box, keep_ids)

    # NMS within each class; results index into the filtered candidates.
    nms_ids = tf.image.non_max_suppression(
        kept_box, kept_prob,
        cfg.TEST.RESULTS_PER_IM, cfg.TEST.FRCNN_NMS_THRESH)
    # Map back to positions in the original `prob`.
    chosen = tf.to_int32(tf.gather(keep_ids, nms_ids))

    # Ascending sort via top_k on the negated values
    # (tf.contrib.framework.sort is available in TF>1.4.0).
    ascending = -tf.nn.top_k(-chosen, k=tf.size(chosen))[0]
    return tf.sparse_to_dense(
        sparse_indices=ascending,
        output_shape=mask_shape,
        sparse_values=True,
        default_value=False)
def ced(model, config, scope, connect, threshold = 1e-5):
    """Attach an element-wise binary cross-entropy layer named `scope`.

    The layer takes its lengths and its input from the `connect` layer's
    `*_out*` entries.  The input is clipped to `[threshold, 1 - threshold]`
    (no softmax is applied).  Labels arrive as sparse (indices, values)
    placeholders and are densified with -1 as the fill value; positions
    whose label is below zero contribute nothing to the loss.  `config` is
    not used by this function.

    Returns:
      The same `model` dict, with the `'<scope>_*'` entries added.
    """
    with tf.variable_scope(scope), tf.name_scope(scope):
        with tf.variable_scope('inputs'), tf.name_scope('inputs'):
            len0 = model['%s_out0length' %connect]
            len1 = model['%s_out1length' %connect]
            len2 = model['%s_out2length' %connect]
            max_len2 = model['%s_maxout2length' %connect]
            model['%s_in0length' %scope] = len0
            model['%s_in1length' %scope] = len1
            model['%s_in2length' %scope] = len2
            model['%s_maxin2length' %scope] = max_len2

            probabilities = tf.clip_by_value(
                model['%s_outputs' %connect],
                threshold, 1. - threshold, name = '%s_inputs' %scope)
            model['%s_inputs' %scope] = probabilities

            model['%s_out0length' %scope] = len0
            model['%s_out1length' %scope] = len1
            model['%s_out2length' %scope] = tf.placeholder(
                tf.int32, [len0], '%s_out2length' %scope)
            model['%s_maxout2length' %scope] = max_len2

        dense_shape = [len0, max_len2, max_len2]

        with tf.variable_scope('labels'), tf.name_scope('labels'):
            model['%s_labels_len' %scope] = tf.placeholder(
                tf.int32, [len0], '%s_labels_len' %scope)
            label_indices = tf.placeholder(
                tf.int64, [None, 3], '%s_labels_ind' %scope)
            model['%s_labels_ind' %scope] = label_indices
            label_values = tf.placeholder(
                tf.float32, [None], '%s_labels_val' %scope)
            model['%s_labels_val' %scope] = label_values

            # Positions without a label get -1.
            labels = tf.sparse_to_dense(
                label_indices, dense_shape, label_values, -1,
                name = '%s_labels' %scope)
            model['%s_labels' %scope] = labels

        with tf.variable_scope('loss'), tf.name_scope('loss'):
            zeros = tf.zeros(dense_shape, tf.float32)
            positive_term = tf.multiply(labels, tf.log(probabilities))
            negative_term = tf.multiply(
                tf.subtract(1., labels),
                tf.log(tf.subtract(1., probabilities)))
            cross_entropy = -tf.add(positive_term, negative_term)
            # Unlabelled positions (label < 0) are replaced by zero.
            masked = tf.where(tf.less(labels, zeros), zeros, cross_entropy)
            model['%s_loss' %scope] = tf.reduce_sum(
                masked, name = '%s_loss' %scope)

        with tf.variable_scope('outputs'), tf.name_scope('outputs'):
            model['%s_output' %scope] = probabilities
    return model
def loss(logits, labels):
    """Calculates the loss from the logits and the labels.

    Args:
      logits: Logits tensor, float - [batch_size, NUM_CLASSES].
      labels: Labels tensor, int32 - [batch_size].

    Returns:
      loss: Loss tensor of type float (mean softmax cross entropy).
    """
    # Expand the integer labels in [0, NUM_CLASSES) to dense one-hot rows:
    # one row per example, 1.0 at the label's column and 0.0 elsewhere.
    num_examples = tf.size(labels)
    class_ids = tf.expand_dims(labels, 1)
    row_ids = tf.expand_dims(tf.range(0, num_examples), 1)
    coordinates = tf.concat(1, [row_ids, class_ids])
    onehot_labels = tf.sparse_to_dense(
        coordinates, tf.pack([num_examples, NUM_CLASSES]), 1.0, 0.0)

    per_example = tf.nn.softmax_cross_entropy_with_logits(
        logits, onehot_labels, name='xentropy')
    return tf.reduce_mean(per_example, name='xentropy_mean')
def loss(logits, labels):
    """Return mean softmax cross entropy plus every other collected loss.

    Args:
      logits: Logits from inference().
      labels: Labels from distorted_inputs or inputs(). 1-D tensor
              of shape [batch_size]

    Returns:
      Loss tensor of type float: the sum of all entries in the 'losses'
      collection after the mean cross entropy has been added to it.
    """
    batch_size = FLAGS.batch_size

    # Densify the labels to one-hot rows of shape [batch_size, NUM_CLASSES].
    class_ids = tf.reshape(labels, [batch_size, 1])
    row_ids = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
    coordinates = tf.concat(1, [row_ids, class_ids])
    dense_labels = tf.sparse_to_dense(
        coordinates, [batch_size, NUM_CLASSES], 1.0, 0.0)

    # Average cross entropy across the batch, registered with the other
    # losses.
    per_example = tf.nn.softmax_cross_entropy_with_logits(
        logits, dense_labels, name='cross_entropy_per_example')
    mean_cross_entropy = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', mean_cross_entropy)

    # Total = cross entropy + whatever else (e.g. weight decay) was collected.
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def _testGraphExtensionRestore(self):
    """Restore a saved meta graph, extend it with a loss, run one train step."""
    base_dir = os.path.join(self.get_temp_dir(), "graph_extension")
    meta_path = os.path.join(base_dir, "metafile")
    checkpoint_path = os.path.join(base_dir, "saver0.ckpt")

    with self.test_session(graph=tf.Graph()) as sess:
        # Restores from MetaGraphDef.
        restored_saver = tf.train.import_meta_graph(meta_path)
        # Generates a new MetaGraphDef.
        restored_saver.export_meta_graph()
        # Restores from checkpoint.
        restored_saver.restore(sess, checkpoint_path)

        # Adds loss and train: 100 all-zero labels, one-hot over 10 classes.
        labels = tf.constant(0, tf.int32, shape=[100], name="labels")
        num_examples = tf.size(labels)
        class_ids = tf.expand_dims(labels, 1)
        row_ids = tf.expand_dims(tf.range(0, num_examples), 1)
        coordinates = tf.concat(1, [row_ids, class_ids])
        onehot_labels = tf.sparse_to_dense(
            coordinates, tf.pack([num_examples, 10]), 1.0, 0.0)

        logits = tf.get_collection("logits")[0]
        per_example = tf.nn.softmax_cross_entropy_with_logits(
            logits, onehot_labels, name="xentropy")
        mean_loss = tf.reduce_mean(per_example, name="xentropy_mean")
        tf.scalar_summary(mean_loss.op.name, mean_loss)

        # Creates the gradient descent optimizer with the given learning rate.
        optimizer = tf.train.GradientDescentOptimizer(0.01)

        # Runs train_op.
        sess.run(optimizer.minimize(mean_loss))
def one_hot_matrix(tensor_in, num_classes, on_value=1.0, off_value=0.0):
    """Encodes indices from given tensor as one-hot tensor.

    TODO(ilblackdragon): Ideally implementation should be part of TensorFlow
    with Eigen-native operation.

    Args:
      tensor_in: Input tensor of shape [N1, N2].
      num_classes: Number of classes to expand index into.
      on_value: Tensor or float, value to fill-in given index.
      off_value: Tensor or float, value to fill-in everything else.

    Returns:
      Tensor of shape [N1, N2, num_classes] with 1.0 for each id in
      original tensor.
    """
    source = tf.convert_to_tensor(tensor_in)

    # Flatten the ids into a single column and pair each with its flat
    # position, giving one (position, class) coordinate per id.
    class_column = tf.to_int64(tf.reshape(source, [-1, 1]))
    n_items = tf.shape(class_column)[0]
    source_dims = tf.shape(source)
    position_column = tf.to_int64(tf.reshape(tf.range(0, n_items), [-1, 1]))
    coords = tf.concat(1, [position_column, class_column])

    flat_shape = tf.to_int64(expand_concat(0, [n_items, num_classes]))
    flat_one_hot = tf.sparse_to_dense(coords, flat_shape, on_value, off_value)

    # Restore the input's leading dimensions, with the class axis appended,
    # and record the matching static shape.
    result = tf.reshape(
        flat_one_hot, tf.concat(0, [source_dims, [num_classes]]))
    result.set_shape(source.get_shape().concatenate(num_classes))
    return result
def loss(logits, labels):
    """Return the summed 'losses' collection after adding the batch cross entropy.

    Args:
      logits: Logits to score.
      labels: Integer class labels, one per example; a trailing axis is
        added here before they are paired with row indices.

    Returns:
      Tensor named 'total_loss': the sum of every entry in the 'losses'
      collection, including the mean cross entropy added by this call.
    """
    batch = FLAGS.batch_size

    label_column = tf.expand_dims(labels, 1)
    row_column = tf.expand_dims(tf.range(0, batch, 1), 1)
    coords = tf.concat(1, [row_column, label_column])

    # The class count given to sparse_to_dense must be the largest class
    # label plus one.
    one_hot = tf.sparse_to_dense(coords, [batch, NUM_CLASSES], 1.0, 0.0)

    per_example = tf.nn.softmax_cross_entropy_with_logits(
        logits, one_hot, name='cross_entropy_per_example')
    tf.add_to_collection(
        'losses', tf.reduce_mean(per_example, name='cross_entropy'))

    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def _count_matrix_input(self, filenames, submatrix_rows, submatrix_cols):
    """Creates ops that read submatrix shards from disk.

    Args:
      filenames: List of shard file names. It is shuffled in place.
      submatrix_rows: Number of rows in each shard's submatrix.
      submatrix_cols: Number of columns in each shard's submatrix.

    Returns:
      A (global_row, global_col, count) tuple, where `count` is the dense
      [submatrix_rows, submatrix_cols] matrix rebuilt from the shard's
      sparse entries.
    """
    random.shuffle(filenames)
    shard_queue = tf.train.string_input_producer(filenames)
    _, serialized = tf.WholeFileReader().read(shard_queue)

    spec = {
        'global_row': tf.FixedLenFeature([submatrix_rows], dtype=tf.int64),
        'global_col': tf.FixedLenFeature([submatrix_cols], dtype=tf.int64),
        'sparse_local_row': tf.VarLenFeature(dtype=tf.int64),
        'sparse_local_col': tf.VarLenFeature(dtype=tf.int64),
        'sparse_value': tf.VarLenFeature(dtype=tf.float32),
    }
    parsed = tf.parse_single_example(serialized, features=spec)

    def as_column(sparse_feature):
        # Values of a variable-length feature as a column vector.
        return tf.expand_dims(sparse_feature.values, 1)

    # One (local_row, local_col) coordinate per stored value.
    cell_coords = tf.concat(
        axis=1,
        values=[as_column(parsed['sparse_local_row']),
                as_column(parsed['sparse_local_col'])])
    count = tf.sparse_to_dense(cell_coords,
                               [submatrix_rows, submatrix_cols],
                               parsed['sparse_value'].values)

    return parsed['global_row'], parsed['global_col'], count
def _sparse_to_dense(labels, num_classes):
    """Expand integer class labels into a dense one-hot matrix.

    Args:
      labels: Tensor of class ids; it is reshaped to a single column.
      num_classes: Width of the resulting matrix.

    Returns:
      Tensor of shape [batch_size, num_classes] holding 1.0 at each
      (row, label) position and 0.0 elsewhere, where batch_size is the
      static first dimension of the reshaped labels.
    """
    label_column = tf.reshape(labels, [-1, 1])
    n_rows = label_column.get_shape().as_list()[0]
    row_column = tf.reshape(tf.range(n_rows), [n_rows, 1])
    coords = tf.concat(1, [row_column, label_column])
    return tf.sparse_to_dense(coords, [n_rows, num_classes], 1.0, 0.0)
def _create_network(self):
    """Build the encoder/decoder graph and store the training loss on `self.loss`.

    Steps visible in this method: embed the caption with
    `_get_word_embedding`, run an LSTM encoder (uni- or bidirectional) over
    it, gather one encoder output per example, map it through
    `_get_middle_embedding`, then unroll `self.lstm` for `maxlen` steps while
    accumulating a masked loss.

    Side effects: sets `self.network_weights`, `self.embedded_input_KLD_loss`,
    `self.input_embedding_KLD_loss`, `self.deb`, `self.other_loss`,
    `self.input_KLD_loss`, `self.debug` and `self.loss`, and prints debug
    information while the graph is being built.

    NOTE(review): `binary_dim`, `vanilla`, `lstm_stack`, `use_bdlstm`,
    `form2`, `form3`, `form4`, `use_ctc`, `rnn` and `ctc_loss` are not defined
    in this method - presumably module-level switches/imports; confirm.
    """
    # Initialize autoencode network weights and biases
    network_weights = self._initialize_weights(**self.network_architecture)
    # NOTE(review): start_token_tensor is not used again in this method.
    start_token_tensor = tf.constant(
        (np.zeros([self.batch_size, binary_dim])).astype(np.float32),
        dtype=tf.float32)
    self.network_weights = network_weights
    # Per-example sequence length, taken as the sum of the mask over its
    # last axis.
    seqlen = tf.cast(tf.reduce_sum(self.mask, reduction_indices=-1), tf.int32)
    self.embedded_input_KLD_loss = tf.constant(0.0)
    self.input_embedding_KLD_loss = tf.constant(0.0)

    # def train_encoder():
    # Embed every caption position at once (caption flattened to one axis).
    embedded_input, self.embedded_input_KLD_loss = self._get_word_embedding(
        [
            network_weights['variational_encoding'],
            network_weights['biases_variational_encoding']
        ],
        network_weights['input_meaning'],
        tf.reshape(
            self.caption_placeholder,
            [self.batch_size * self.network_architecture['maxlen']]),
        logit=True)
    print 'eshape', embedded_input.shape
    embedded_input = tf.reshape(embedded_input, [
        self.batch_size, self.network_architecture['maxlen'],
        self.network_architecture['n_lstm_input']
    ])
    print embedded_input.shape
    if not vanilla:
        # NOTE(review): the bare name `embedded_input_KLD_loss` is never
        # assigned in this method (only the `self.` attribute is); unless a
        # global of that name exists this raises NameError - confirm.
        self.embedded_input_KLD_loss = tf.reshape(
            embedded_input_KLD_loss,
            [-1, self.network_architecture['maxlen']])[:, 1:]
    # The encoder skips the first position of every sequence.
    encoder_input = embedded_input[:, 1:, :]
    cell = tf.contrib.rnn.BasicLSTMCell(
        self.network_architecture['n_lstm_input'])
    if lstm_stack > 1:
        cell = tf.contrib.rnn.MultiRNNCell([cell] * lstm_stack)
    if not use_bdlstm:
        encoder_outs, encoder_states = rnn.dynamic_rnn(
            cell,
            encoder_input,
            sequence_length=seqlen - 1,
            dtype=tf.float32,
            time_major=False)
    else:
        backward_cell = tf.contrib.rnn.BasicLSTMCell(
            self.network_architecture['n_lstm_input'])
        if lstm_stack > 1:
            backward_cell = tf.contrib.rnn.MultiRNNCell([backward_cell] *
                                                        lstm_stack)
        encoder_outs, encoder_states = rnn.bidirectional_dynamic_rnn(
            cell,
            backward_cell,
            encoder_input,
            sequence_length=seqlen - 1,
            dtype=tf.float32,
            time_major=False)
    # Build (batch_index, seqlen - 2) pairs: with sequence_length
    # seqlen - 1, index seqlen - 2 is the last step the encoder processed.
    ix_range = tf.range(0, self.batch_size, 1)
    ixs = tf.expand_dims(ix_range, -1)
    to_cat = tf.expand_dims(seqlen - 2, -1)
    gather_inds = tf.concat([ixs, to_cat], axis=-1)
    print encoder_outs
    outs = tf.gather_nd(encoder_outs, gather_inds)
    # outs=tf.nn.dropout(outs,.75)
    # Debug handle: the caption tokens at the gathered positions.
    self.deb = tf.gather_nd(self.caption_placeholder[:, 1:], gather_inds)
    print outs.shape
    input_embedding, self.input_embedding_KLD_loss = self._get_middle_embedding(
        [
            network_weights['middle_encoding'],
            network_weights['biases_middle_encoding']
        ],
        network_weights['middle_encoding'],
        outs,
        logit=True)
    # return input_embedding
    # input_embedding=tf.nn.l2_normalize(input_embedding,dim=-1)
    self.other_loss = tf.constant(0, dtype=tf.float32)
    # KL weight grows linearly with the training timestep.
    KLD_penalty = (tf.cast(self.timestep, tf.float32) / 1.0) * 1e-3
    # NOTE(review): cos_penalty is only referenced by commented-out code below.
    cos_penalty = tf.maximum(-0.1,
                             (tf.cast(self.timestep, tf.float32) /
                              (5.0))) * 1e-3
    self.input_KLD_loss = tf.constant(0.0)

    # def train_decoder():
    if form3:
        # NOTE(review): `_x` is only used by the commented-out code below.
        _x, self.input_KLD_loss = self._get_input_embedding([
            network_weights['variational_encoding'],
            network_weights['biases_variational_encoding']
        ], network_weights['variational_encoding'])
        self.input_KLD_loss = tf.reduce_mean(
            self.input_KLD_loss
        ) * KLD_penalty  #\*tf.constant(0.0,dtype=tf.float32)
        # normed_embedding= tf.nn.l2_normalize(self.mid_var, dim=-1)
        # normed_target=tf.nn.l2_normalize(self.word_var,dim=-1)
        # cos_sim=(tf.reduce_sum(tf.multiply(normed_embedding,normed_target),axis=-1))
        # # # self.exp_loss=tf.reduce_mean((-cos_sim))
        # # # self.exp_loss=tf.reduce_sum(xentropy)/float(self.batch_size)
        # self.other_loss += tf.reduce_mean(1-(cos_sim))*cos_penalty
        # # other_loss+=tf.reduce_mean(tf.reduce_sum(tf.square(_x-input_embedding),axis=-1))*cos_penalty
        # _x=tf.concat([input_embedding,_x],axis=-1)
        # tempe=tf.Variable(xavier_init(self.network_architecture['n_lstm_input']*2,self.network_architecture['n_lstm_input']),name='emb_cat')
        # tempb=tf.Variable(tf.zeros([self.network_architecture['n_lstm_input']]),name='emb_cat_b')
        # _x=tf.matmul(_x,tempe)+tempb
        # input_embedding=_x
    # input_embedding=tf.cond(tf.equal(self.timestep%5,0),train_decoder,train_encoder)

    # Use recognition network to determine mean and
    # (log) variance of Gaussian distribution in latent
    # space
    # if not same_embedding:
    #     input_embedding,input_embedding_KLD_loss=self._get_input_embedding([network_weights['variational_encoding'],network_weights['biases_variational_encoding']],network_weights['input_meaning'])
    # else:
    #     input_embedding,input_embedding_KLD_loss=self._get_input_embedding([network_weights['variational_encoding'],network_weights['biases_variational_encoding']],network_weights['LSTM'])
    # if not embeddings_trainable:
    #     input_embedding=tf.stop_gradient(input_embedding)
    # embed2decoder=tf.Variable(xavier_init(self.network_architecture['n_z_m_2'],self.network_architecture['n_lstm_input']),name='decoder_embedding_weight')
    # embed2decoder_bias=tf.Variable(tf.zeros(self.network_architecture['n_lstm_input']),name='decoder_embedding_bias')
    state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)
    # input_embedding=tf.matmul(input_embedding,embed2decoder)+embed2decoder_bias
    loss = 0
    self.debug = 0
    probs = []
    with tf.variable_scope("RNN"):
        for i in range(self.network_architecture['maxlen']):
            if i > 0:
                # current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
                # Steps after the first are fed the embedding of the
                # previous caption token (or, under form4, the sentence
                # embedding again).
                if form4:
                    current_embedding, KLD_loss = input_embedding, 0
                elif form2:
                    current_embedding, KLD_loss = self._get_word_embedding(
                        [
                            network_weights['variational_encoding'],
                            network_weights['biases_variational_encoding']
                        ],
                        network_weights['LSTM'],
                        self.caption_placeholder[:, i - 1],
                        logit=True)
                else:
                    current_embedding, KLD_loss = self._get_word_embedding(
                        [
                            network_weights['variational_encoding'],
                            network_weights['biases_variational_encoding']
                        ], network_weights['LSTM'],
                        self.caption_placeholder[:, i - 1])
                loss += tf.reduce_sum(
                    KLD_loss * self.mask[:, i]) * KLD_penalty
            else:
                # The first step is fed the encoded sentence embedding.
                current_embedding = input_embedding
            if i > 0:
                # Share the LSTM variables across all unrolled steps.
                tf.get_variable_scope().reuse_variables()
            out, state = self.lstm(current_embedding, state)
            if i > 0:
                if not form2:
                    # Dense one-hot target built from (row, token) pairs.
                    labels = tf.expand_dims(self.caption_placeholder[:, i], 1)
                    ix_range = tf.range(0, self.batch_size, 1)
                    ixs = tf.expand_dims(ix_range, 1)
                    concat = tf.concat([ixs, labels], 1)
                    onehot = tf.sparse_to_dense(
                        concat, tf.stack([self.batch_size, self.n_words]),
                        1.0, 0.0)
                else:
                    onehot = self.caption_placeholder[:, i]
                logit = tf.matmul(
                    out, network_weights['LSTM']['encoding_weight']
                ) + network_weights['LSTM']['encoding_bias']
                if not use_ctc:
                    # NOTE(review): when form2 is false `onehot` is a dense
                    # float matrix, but the sparse cross-entropy op is
                    # documented to take integer class ids - confirm this
                    # path is ever exercised.
                    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=logit, labels=onehot)
                    # Zero out padded positions before summing over the batch.
                    xentropy = xentropy * self.mask[:, i]
                    xentropy = tf.reduce_sum(xentropy)
                    self.debug += xentropy
                    loss += xentropy
                else:
                    probs.append(tf.expand_dims(tf.nn.sigmoid(logit), 1))
    # Overwrites the per-step accumulation stored in self.debug above.
    self.debug = [
        self.input_KLD_loss,
        tf.reduce_mean(self.input_embedding_KLD_loss) / self.batch_size *
        KLD_penalty, self.other_loss, KLD_penalty
    ]
    if not use_ctc:
        loss_ctc = 0
        # self.debug=other_loss
        # self.debug=[input_KLD_loss,embedded_input_KLD_loss,input_embedding_KLD_loss]
    else:
        probs = tf.concat(probs, axis=1)
        probs = ctc_loss.get_output_probabilities(
            probs, self.caption_placeholder[:, 1:, :])
        loss_ctc = ctc_loss.loss(
            probs, self.caption_placeholder[:, 1:, :],
            self.network_architecture['maxlen'] - 2, self.batch_size,
            seqlen - 1)
        self.debug = loss_ctc
    #
    # Total: accumulated loss normalised by the number of unmasked
    # positions after the first, plus the KL terms, the CTC loss and the
    # auxiliary losses.
    loss = (loss / tf.reduce_sum(self.mask[:, 1:])) + tf.reduce_sum(
        self.input_embedding_KLD_loss
    ) / self.batch_size * KLD_penalty + tf.reduce_sum(
        self.embedded_input_KLD_loss * self.mask[:, 1:]
    ) / tf.reduce_sum(
        self.mask[:, 1:]
    ) * KLD_penalty + loss_ctc + self.input_KLD_loss + self.other_loss
    print 'makin loss'
    self.loss = loss
def build_model(self, video, video_mask, caption, caption_1, caption_mask):
    """Build the training graph: encode video and sentence, then decode both.

    The video features are projected and mean-pooled, the sentence
    `caption_1` is run through `self.lstm2_dropout`, both encodings are
    concatenated and passed to `self.vae`, and the resulting semantic vector
    seeds two decoders: one scored against `caption` with cross entropy, one
    scored against `video` with a squared-error loss.

    Args:
      video: Video feature tensor; reshaped to [-1, self.dim_image] here.
      video_mask: Mask over video steps (cast to float32).
      caption: Target token ids, indexed as caption[:, i] in the decoder.
      caption_1: Token ids fed to the sentence encoder.
      caption_mask: Mask over caption steps (cast to float32).

    Returns:
      (loss, loss_caption, loss_latent, loss_video, output_semantic,
      output1, output2, drop_type), where `drop_type` is the int32 scalar
      placeholder created at the top of this method.

    NOTE(review): `feat_scale_factor`, `cpu_device`, `n_caption_steps`,
    `caption_weight`, `video_weight` and `latent_weight` are not defined in
    this method - presumably module-level settings; confirm.
    """
    drop_type = tf.placeholder(tf.int32, shape=[])
    caption_mask = tf.cast(caption_mask, tf.float32)
    video_mask = tf.cast(video_mask, tf.float32)
    # for decoding
    video = video * tf.constant(feat_scale_factor)
    video_flat = tf.reshape(video, [-1, self.dim_image])  # (b x nv) x d
    image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W,
                                self.encode_image_b)  # (b x nv) x h
    image_emb = tf.reshape(
        image_emb,
        [self.batch_size, self.n_video_steps, self.dim_hidden
         ])  # b x nv x h
    c_init = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    m_init = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    state2 = (c_init, m_init)  # 2 x b x h

    ######## Encoding Stage #########
    # encoding video
    # mean pooling && mapping into (-1, 1) range
    output1 = tf.nn.tanh(tf.reduce_mean(image_emb, axis=1))  # b x h
    # encoding sentence
    with tf.variable_scope("model") as scope:
        for i in xrange(self.n_caption_steps):
            if i > 0:
                # Share the LSTM2 variables across all unrolled steps.
                scope.reuse_variables()
            with tf.variable_scope("LSTM2"):
                with tf.device(cpu_device):
                    current_embed = tf.nn.embedding_lookup(
                        self.Wemb, caption_1[:, i])  # b x h
                output2, state2 = self.lstm2_dropout(
                    current_embed, state2)  # b x h
    ######## Encoding Stage #########

    #### 0: keep both 1: keep video only 2: keep sentence only
    ######## Dropout Stage #########
    # NOTE(review): `drop_type` is a placeholder and this is a Python `if`
    # evaluated once while the graph is built, so the branch taken cannot
    # depend on the value fed at run time - confirm intent (tf.cond or
    # tf.case may be what was meant).
    if drop_type == 1:
        output2 = tf.constant(0, dtype=tf.float32) * output2
        output2 = tf.stop_gradient(output2)
    elif drop_type == 2:
        output1 = tf.constant(0, dtype=tf.float32) * output1
        output1 = tf.stop_gradient(output1)
    ######## Dropout Stage #########

    ######## Semantic Learning Stage ########
    ##### normalization before concatenation
    input_state = tf.concat([output1, output2], 1)  # b x (2 * h)
    loss_latent, output_semantic = self.vae(input_state)
    ######## Semantic Learning Stage ########

    ######## Decoding Stage ##########
    state3 = (c_init, m_init)  # 2 x b x h
    state4 = (c_init, m_init)  # 2 x b x h
    current_embed = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    video_prev = tf.zeros([self.batch_size, self.dim_hidden])
    loss_caption = 0.0
    loss_video = 0.0

    ## decoding sentence without attention
    with tf.variable_scope("model") as scope:
        # Feed the semantic vector once to prime the decoder state.
        with tf.variable_scope("LSTM3"):
            _, state3 = self.lstm3_dropout(output_semantic, state3)  # b x h
        # NOTE(review): the encoder loop uses self.n_caption_steps but this
        # loop uses a bare `n_caption_steps` - confirm both refer to the
        # same value.
        for i in xrange(n_caption_steps):
            scope.reuse_variables()
            with tf.variable_scope("LSTM3"):
                output3, state3 = self.lstm3_dropout(
                    current_embed, state3)  # b x h
            # One-hot target for step i built from (row, token) pairs.
            labels = tf.expand_dims(caption[:, i], 1)  # b x 1
            indices = tf.expand_dims(tf.range(0, self.batch_size, 1),
                                     1)  # b x 1
            concated = tf.concat([indices, labels], 1)  # b x 2
            onehot_labels = tf.sparse_to_dense(
                concated, tf.stack([self.batch_size, self.n_words]), 1.0,
                0.0)  # b x w
            # The next step is fed the current target token (teacher forcing).
            with tf.device(cpu_device):
                current_embed = tf.nn.embedding_lookup(
                    self.Wemb, caption[:, i])
            logit_words = tf.nn.xw_plus_b(output3, self.embed_word_W,
                                          self.embed_word_b)  # b x w
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logit_words, labels=onehot_labels)  # b x 1
            cross_entropy = cross_entropy * caption_mask[:, i]  # b x 1
            loss_caption += tf.reduce_sum(cross_entropy)  # 1

    ## decoding video without attention
    with tf.variable_scope("model") as scope:
        ## TODO: add attention for video decoding
        ## write into memory first
        with tf.variable_scope("LSTM4"):
            _, state4 = self.lstm4_dropout(output_semantic, state4)
        for i in xrange(self.n_video_steps):
            scope.reuse_variables()
            with tf.variable_scope("LSTM4"):
                output4, state4 = self.lstm4_dropout(video_prev, state4)
            decode_image = tf.nn.xw_plus_b(output4, self.decode_image_W,
                                           self.decode_image_b)  # b x d_im
            decode_image = tf.nn.sigmoid(decode_image)
            # The next step is fed the current frame's embedding.
            video_prev = image_emb[:, i, :]  # b x h
            euclid_loss = tf.reduce_sum(tf.square(
                tf.subtract(decode_image, video[:, i, :])),
                                        axis=1,
                                        keep_dims=True)  # b x 1
            # NOTE(review): euclid_loss keeps its reduced axis while
            # video_mask[:, i] drops one; if video_mask is rank 2 this
            # product broadcasts to b x b rather than b x 1 - confirm.
            euclid_loss = euclid_loss * video_mask[:, i]  # b x 1
            loss_video += tf.reduce_sum(euclid_loss)  # 1

    # Normalise each loss by the total of its mask.
    loss_caption = loss_caption / tf.reduce_sum(caption_mask)
    loss_video = loss_video / tf.reduce_sum(video_mask)
    loss = tf.constant(caption_weight) * loss_caption + tf.constant(video_weight) * loss_video + \
        tf.constant(latent_weight) * loss_latent
    return loss, loss_caption, loss_latent, loss_video, output_semantic, output1, output2, drop_type
def _sparse_to_batch(self, sparse):
    """Convert a sparse tensor into dense ids plus a presence mask.

    Args:
      sparse: Sparse tensor exposing `indices`, `values` and `dense_shape`.

    Returns:
      (ids, mask): `ids` is the densified tensor; `mask` has the same dense
      shape and holds int32 ones at every position listed in
      `sparse.indices`.
    """
    dense_shape = sparse.dense_shape
    print('shapes', dense_shape)
    ids = tf.sparse_tensor_to_dense(sparse)
    ones = tf.ones_like(sparse.values, dtype=tf.int32)
    mask = tf.sparse_to_dense(sparse.indices, dense_shape, ones)
    return ids, mask
def __init__(
        self,
        source_vocab_size,
        target_vocab_size,
        entity_vocab_size,  # entity
        buckets,
        state_size,
        num_layers,
        embedding_size,
        max_gradient,
        batch_size,
        learning_rate,
        forward_only=False,
        dtype=tf.float32):
    """Build the seq2seq graph with an entity encoder and entity attention.

    The graph consists of a two-layer bidirectional GRU encoder, an entity
    encoder (hard-wired to the 'cnn' variant with a highway gate), a
    top-K-restricted attention over entity vectors, and a two-layer GRU
    decoder whose last layer is wrapped with attention. With
    `forward_only=False` a training loss and an Adadelta update op are
    created; otherwise a beam-search decoder (beam width 10) is built.

    Args:
      source_vocab_size: Rows of the encoder embedding matrix.
      target_vocab_size: Rows of the decoder embedding matrix and width of
        the output layer.
      entity_vocab_size: Rows of the entity embedding matrix.
      buckets: Stored on `self.buckets`; not otherwise used here.
      state_size: GRU state size.
      num_layers: NOTE(review): never read in this method - the layer count
        is hard-coded to 2; confirm.
      embedding_size: Columns of the encoder/decoder embedding matrices.
      max_gradient: Global-norm clipping threshold for the gradients.
      batch_size: Static batch size used for all placeholders.
      learning_rate: Learning rate passed to the Adadelta optimizer.
      forward_only: If true, build the inference (beam search) graph and
        skip dropout, loss and optimizer.
      dtype: dtype for the "seq2seq" variable scope and the RNNs.

    NOTE(review): `emb_init`, `xavier_initializer`, `fc_layer`,
    `tile_batch`, `nest`, `BahdanauAttention`, `AttentionWrapper`,
    `BeamSearchDecoder` and `data_util` are not defined in this method -
    presumably imported at file level; confirm.
    """
    # Architecture switches, fixed here rather than passed in.
    entity_encode = 'cnn'
    highway = True

    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.entity_vocab_size = entity_vocab_size  # entity
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    self.state_size = state_size

    self.encoder_inputs = tf.placeholder(tf.int32,
                                         shape=[self.batch_size, None],
                                         name='1')
    self.decoder_inputs = tf.placeholder(tf.int32,
                                         shape=[self.batch_size, None],
                                         name='2')
    self.decoder_targets = tf.placeholder(tf.int32,
                                          shape=[self.batch_size, None],
                                          name='3')
    self.encoder_len = tf.placeholder(tf.int32,
                                      shape=[self.batch_size],
                                      name='4')
    self.decoder_len = tf.placeholder(tf.int32,
                                      shape=[self.batch_size],
                                      name='5')
    self.beam_tok = tf.placeholder(tf.int32,
                                   shape=[self.batch_size],
                                   name='6')
    self.prev_att = tf.placeholder(tf.float32,
                                   shape=[self.batch_size, state_size * 2],
                                   name='7')
    # Number of entities kept by the top-K attention below.
    self.K = tf.placeholder(tf.int32)
    self.lvt_dict = tf.placeholder(tf.int32, shape=[None], name='8')
    self.lvt_len = tf.placeholder(tf.int32, name='9')
    self.batch_dec_len = tf.placeholder(tf.int32, name='10')
    # entity
    self.entity_inputs = tf.placeholder(tf.int32,
                                        shape=[self.batch_size, None])
    self.entity_len = tf.placeholder(tf.int32, shape=[self.batch_size])

    encoder_fw_cells = []
    encoder_bw_cells = []
    for _ in range(2):
        encoder_fw_cells.append(tf.contrib.rnn.GRUCell(state_size))
        encoder_bw_cells.append(tf.contrib.rnn.GRUCell(state_size))
    if not forward_only:
        # Output dropout is applied only when training.
        for i in range(2):
            encoder_fw_cells[i] = tf.contrib.rnn.DropoutWrapper(
                encoder_fw_cells[i], output_keep_prob=0.50)
            encoder_bw_cells[i] = tf.contrib.rnn.DropoutWrapper(
                encoder_bw_cells[i], output_keep_prob=0.50)
    # These two are rebuilt inside the "encoder" scope below.
    encoder_fw_cell = tf.contrib.rnn.MultiRNNCell(encoder_fw_cells)
    encoder_bw_cell = tf.contrib.rnn.MultiRNNCell(encoder_bw_cells)
    #decode
    decoder_cells = []
    for _ in range(2):
        decoder_cells.append(tf.contrib.rnn.GRUCell(state_size))
    # Rebuilt inside the "decoder" scope once the last cell is wrapped.
    decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)
    self.loss = tf.constant(0)

    with tf.variable_scope("seq2seq", dtype=dtype):
        with tf.variable_scope("encoder"):
            self.encoder_emb = tf.get_variable(
                "embedding", [source_vocab_size, embedding_size],
                initializer=emb_init)
            encoder_inputs_emb = tf.nn.embedding_lookup(
                self.encoder_emb, self.encoder_inputs)
            encoder_fw_cell = tf.contrib.rnn.MultiRNNCell(encoder_fw_cells)
            encoder_bw_cell = tf.contrib.rnn.MultiRNNCell(encoder_bw_cells)
            encoder_outputs, encoder_states = \
                tf.nn.bidirectional_dynamic_rnn(
                    encoder_fw_cell,
                    encoder_bw_cell,
                    encoder_inputs_emb,
                    sequence_length=self.encoder_len,
                    dtype=dtype)
            encoder_len = self.encoder_len
            if forward_only:
                # Replicate encoder results 10 times per example; 10 is also
                # the beam_width passed to BeamSearchDecoder below.
                encoder_outputs = tile_batch(encoder_outputs, multiplier=10)
                encoder_states = nest.map_structure(
                    lambda s: tile_batch(s, 10), encoder_states)
                encoder_len = tile_batch(self.encoder_len, multiplier=10)
            #encoder_states = encoder_states[-1]

        if entity_encode == 'no':  # NO
            # Plain embedding lookup, no sequence encoder.
            with tf.variable_scope("entity_encoder"):
                self.entity_emb = tf.get_variable(
                    "embedding", [entity_vocab_size, 1000],
                    initializer=emb_init)
                entity_vector = tf.nn.embedding_lookup(
                    self.entity_emb, self.entity_inputs)
        elif entity_encode == 'rnn':  # RNN
            with tf.variable_scope("entity_encoder"):
                entity_fw_cell = tf.contrib.rnn.GRUCell(state_size)
                entity_bw_cell = tf.contrib.rnn.GRUCell(state_size)
                if not forward_only:
                    entity_fw_cell = tf.contrib.rnn.DropoutWrapper(
                        entity_fw_cell, output_keep_prob=0.5)
                    entity_bw_cell = tf.contrib.rnn.DropoutWrapper(
                        entity_bw_cell, output_keep_prob=0.5)
                self.entity_emb = tf.get_variable(
                    "embedding", [entity_vocab_size, 1000],
                    initializer=emb_init)
                entity_inputs_emb = tf.nn.embedding_lookup(
                    self.entity_emb, self.entity_inputs)
                entity_outputs, entity_states = \
                    tf.nn.bidirectional_dynamic_rnn(
                        entity_fw_cell,
                        entity_bw_cell,
                        entity_inputs_emb,
                        sequence_length=self.entity_len,
                        dtype=dtype)
                entity_vector = tf.concat(entity_outputs, 2)
                entity_vector.set_shape(
                    [self.batch_size, None, state_size * 2])
                entity_proj = entity_inputs_emb
        elif entity_encode == 'cnn':  # CNN
            with tf.variable_scope("entity_encoder"):
                self.entity_emb = tf.get_variable(
                    "embedding", [entity_vocab_size, 1000],
                    initializer=emb_init)
                entity_inputs_emb = tf.nn.embedding_lookup(
                    self.entity_emb, self.entity_inputs)
                # Add a trailing channel axis for conv2d.
                entity_inputs_emb_expanded = tf.expand_dims(
                    entity_inputs_emb, -1)
                filter_sizes = [3, 5, 7]
                num_filters = [400, 300, 300]
                outputs = []
                for i, filter_size in enumerate(filter_sizes):
                    with tf.name_scope("conv-maxpool-%s" % filter_size):
                        filter_shape = [
                            filter_size, 1000, 1, num_filters[i]
                        ]
                        W = tf.Variable(tf.truncated_normal(filter_shape,
                                                            stddev=0.1),
                                        name="W")
                        b = tf.Variable(tf.constant(0.1,
                                                    shape=[num_filters[i]
                                                           ]),
                                        name="b")
                        conv = tf.nn.conv2d(entity_inputs_emb_expanded,
                                            W,
                                            strides=[1, 1, 1, 1],
                                            padding="VALID",
                                            name="conv")
                        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                        # Trim the narrower filters' outputs so every
                        # branch has the length the width-7 filter produces.
                        erase = int((7 - filter_size) / 2)
                        if erase != 0:
                            h = h[:, erase:-erase, :, :]
                        if not forward_only:
                            h = tf.nn.dropout(h, 0.5)
                        outputs.append(h)
                # Concatenate the filter banks along the channel axis.
                entity_vector = tf.concat(outputs, axis=3)
                entity_vector = tf.squeeze(entity_vector, 2)
                # NOTE(review): the filter banks total 400+300+300 = 1000
                # channels, yet the static shape is declared as
                # state_size * 2 - presumably state_size is 500; confirm.
                entity_vector.set_shape(
                    [self.batch_size, None, state_size * 2])
                # Drop 3 positions at each end, the same amount a width-7
                # VALID convolution removes.
                entity_proj = entity_inputs_emb[:, 3:-3, :]

        if highway:
            # y
            # entity_proj = entity_inputs_emb[:,3:-3,:]
            Wh = tf.get_variable("Wh", [1000, 1000],
                                 initializer=xavier_initializer())
            bh = tf.Variable(tf.constant(0.0, shape=[1000]))
            entity_proj = tf.nn.tanh(tf.tensordot(entity_proj, Wh, 1) + bh)
            if not forward_only:
                entity_proj.set_shape([self.batch_size, None, 1000])
            else:
                entity_proj.set_shape([self.batch_size * 10, None, 1000])
            if not forward_only:
                entity_proj = tf.nn.dropout(entity_proj, keep_prob=0.5)
            # t
            Wt = tf.get_variable("Wt", [1000, 1],
                                 initializer=xavier_initializer())
            bt = tf.Variable(tf.constant(0.0, shape=[1]))
            t = tf.nn.sigmoid(tf.tensordot(entity_vector, Wt, 1) + bt)
            # NOTE(review): Wt is [1000, 1], so the tensordot result's last
            # axis would be 1, yet the static shape declared here is 1000 -
            # confirm.
            if not forward_only:
                t.set_shape([self.batch_size, None, 1000])
            else:
                t.set_shape([self.batch_size * 10, None, 1000])
            self.t = t
            # Gate between the encoded entities and their projected
            # embeddings.
            entity_vector = t * entity_vector + (1 - t) * entity_proj

        with tf.variable_scope("init_state"):
            init_states = []
            for i in range(2):
                init_state = fc_layer(tf.concat(encoder_states[i], 1),
                                      state_size)
                init_states.append(init_state)
            # the shape of bidirectional_dynamic_rnn is weird
            # None for batch_size
            self.init_states = init_states
            #self.init_state.set_shape([self.batch_size, state_size])
            self.att_states = tf.concat(encoder_outputs, 2)
            #with tf.variable_scope("entity_init_state"):
            #    entity_init_state = fc_layer(
            #        tf.concat(entity_states, 1), state_size)
            #    self.entity_init_state = entity_init_state
            #    self.entity_init_state.set_shape([self.batch_size, state_size])
            #    self.entity_att_states = tf.concat(entity_outputs, 2)
            #    self.entity_att_states.set_shape([self.batch_size, None, state_size*2])

        with tf.variable_scope("entity_attention"):
            X = tf.get_variable("X",
                                shape=[1000, state_size],
                                initializer=xavier_initializer())
            x = tf.get_variable("x",
                                shape=[state_size],
                                initializer=xavier_initializer())
            Y = tf.get_variable("Y",
                                shape=[state_size * 2, state_size],
                                initializer=xavier_initializer())
            # Score = x . tanh(Y * encoder_state + X * entity_vector).
            first = tf.matmul(tf.concat(encoder_states[-1], 1), Y)
            first = tf.expand_dims(first, 1)
            other = tf.tensordot(entity_vector, X, 1)
            weights = tf.nn.tanh(first + other)
            if not forward_only:
                weights = tf.nn.dropout(weights, keep_prob=0.5)
            weights = tf.tensordot(weights, x, 1)
            if not forward_only:
                weights.set_shape([self.batch_size, None])
            else:
                weights.set_shape([10 * self.batch_size, None])
            # Keep only the K best-scoring entities per example: build
            # (row, column) coordinates of the top-K scores ...
            k_values, k_indices = tf.nn.top_k(weights, k=self.K)
            my_range = tf.expand_dims(tf.range(0, k_indices.shape[0]), 1)
            #print(my_range)
            my_range_repeated = tf.tile(my_range, [1, self.K])
            full_indices = tf.concat([
                tf.expand_dims(my_range_repeated, 2),
                tf.expand_dims(k_indices, 2)
            ], 2)
            full_indices = tf.reshape(full_indices, [-1, 2])
            output_shape = tf.shape(weights)
            # ... then add 0 at those positions and -1e9 everywhere else so
            # the softmax puts (numerically) all mass on the top-K entries.
            zeros = tf.sparse_to_dense(full_indices,
                                       output_shape,
                                       0.0,
                                       default_value=-1000000000.0,
                                       validate_indices=False)
            weights = tf.nn.softmax(zeros + weights)
            weights = tf.expand_dims(weights, -1)
            self.weights = weights
            # Attention-weighted sum of entity vectors over axis 1.
            context = tf.multiply(entity_vector, weights)
            context = tf.reduce_sum(context, axis=1)

        with tf.variable_scope("attention"):
            attention = BahdanauAttention(state_size, self.att_states,
                                          encoder_len)

        with tf.variable_scope("decoder") as scope:
            #decoder_cells = []
            #for _ in range(2):
            #    decoder_cells.append(tf.contrib.rnn.GRUCell(state_size))
            if not forward_only:
                for i in range(2):
                    decoder_cells[i] = tf.contrib.rnn.DropoutWrapper(
                        decoder_cells[i], output_keep_prob=0.50)
            #for i in range(2):
            # Only the last decoder layer attends over the encoder states
            # and receives the entity context.
            decoder_cells[-1] = AttentionWrapper(decoder_cells[-1],
                                                 attention,
                                                 state_size,
                                                 context=context)
            initial_states = [state for state in init_states]
            # The wrapped last layer starts from its own zero state instead
            # of the encoder-derived initial state.
            if not forward_only:
                initial_states[-1] = decoder_cells[-1].zero_state(
                    batch_size=self.batch_size, dtype=tf.float32)
            else:
                initial_states[-1] = decoder_cells[-1].zero_state(
                    batch_size=10 * self.batch_size, dtype=tf.float32)
            decoder_initial_state = tuple(initial_states)
            decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)
            self.decoder_emb = tf.get_variable(
                "embedding", [target_vocab_size, embedding_size],
                initializer=emb_init)
            output_layer = tf.contrib.keras.layers.Dense(
                target_vocab_size, name="train_output")
            if not forward_only:
                #output_layer = tf.contrib.keras.layers.Dense(target_vocab_size, name="train_output")
                decoder_inputs_emb = tf.nn.embedding_lookup(
                    self.decoder_emb, self.decoder_inputs)
                helper = tf.contrib.seq2seq.TrainingHelper(
                    decoder_inputs_emb, self.decoder_len)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, helper, decoder_initial_state,
                    output_layer)
                outputs, final_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(decoder)
                outputs_logits = tf.identity(outputs.rnn_output)
                self.outputs = outputs_logits
                weights = tf.sequence_mask(self.decoder_len,
                                           dtype=tf.float32)
                # NOTE(review): self.loss_t stores the sequence mask, not
                # the per-token loss computed just below - confirm intent.
                self.loss_t = weights
                loss_t = tf.contrib.seq2seq.sequence_loss(
                    outputs_logits,
                    self.decoder_targets,
                    weights,
                    average_across_timesteps=False,
                    average_across_batch=False)
                # Sum over all tokens, averaged over the batch size only.
                self.loss = tf.reduce_sum(loss_t) / self.batch_size
                params = tf.trainable_variables()
                opt = tf.train.AdadeltaOptimizer(self.learning_rate,
                                                 epsilon=1e-6)
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, norm = \
                    tf.clip_by_global_norm(gradients, max_gradient)
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
                tf.summary.scalar('loss', self.loss)
            else:
                #output_layer = tf.contrib.keras.layers.Dense(target_vocab_size, name="test_output", trainable=True)
                st_toks = tf.convert_to_tensor([data_util.ID_GO] *
                                               self.batch_size,
                                               dtype=tf.int32)

                def embed_proj(inputs):
                    return tf.nn.embedding_lookup(self.decoder_emb, inputs)

                #decoding_helper = GreedyEmbeddingHelper(start_tokens=st_toks, end_token=data_util.ID_EOS, embedding=embed_and_input_proj)
                inference_decoder = BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=embed_proj,
                    start_tokens=st_toks,
                    end_token=data_util.ID_EOS,
                    initial_state=decoder_initial_state,
                    beam_width=10,
                    output_layer=output_layer)
                outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=False,
                    maximum_iterations=100)
                # Keep index 0 of the last (beam) axis of the predictions.
                self.outputs = outputs.predicted_ids[:, :, 0]
                #self.outputs = tf.transpose(outputs.predicted_ids, [0,2,1])
                print(self.outputs)

    self.saver = tf.train.Saver(tf.global_variables())
    #self.saver = tf.train.Saver()
    self.summary_merge = tf.summary.merge_all()
def __init__(self, is_training, config):
    """Build the recurrent graph that predicts per-skill correctness.

    The integer inputs are one-hot encoded to width `2 * num_skills`, fed
    step by step through a stack of LSTM layers, and projected to one logit
    per skill. The logits selected by the target-id placeholder are scored
    against the target-correctness placeholder with a summed sigmoid cross
    entropy, stored on `self._cost`.

    Args:
      is_training: When true (and `config.keep_prob < 1`) each LSTM layer is
        wrapped with output dropout.
      config: Object providing `num_skills`, `num_steps` and `keep_prob`.
    """
    self._batch_size = batch_size = FLAGS.batch_size
    self.num_skills = num_skills = config.num_skills
    self.hidden_size = base_units = FLAGS.hidden_size
    self.num_steps = num_steps = config.num_steps
    input_size = num_skills * 2
    flat_len = batch_size * num_steps

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._target_id = tf.placeholder(tf.int32, [None])
    self._target_correctness = target_correctness = tf.placeholder(
        tf.float32, [None])

    # Layer k (0-based) gets base_units / (k + 1) units; the width of the
    # last layer is what the output projection consumes.
    layers = []
    final_hidden_size = base_units
    for depth in range(FLAGS.hidden_layer_num):
        final_hidden_size = base_units / (depth + 1)
        layer = tf.nn.rnn_cell.LSTMCell(final_hidden_size,
                                        state_is_tuple=True)
        if is_training and config.keep_prob < 1:
            layer = tf.nn.rnn_cell.DropoutWrapper(
                layer, output_keep_prob=FLAGS.keep_prob)
        layers.append(layer)
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

    flat_ids = tf.reshape(self._input_data, [-1])

    # One-hot encoding, pinned to the CPU.
    with tf.device("/cpu:0"):
        id_column = tf.expand_dims(flat_ids, 1)
        row_column = tf.expand_dims(tf.range(0, flat_len, 1), 1)
        coords = tf.concat(1, [row_column, id_column])
        one_hot = tf.sparse_to_dense(
            coords, tf.pack([flat_len, input_size]), 1.0, 0.0)
        one_hot.set_shape([flat_len, input_size])

    # [batch_size, num_steps, input_size] -> time-major -> a list of
    # num_steps tensors, one per step.
    batched = tf.reshape(one_hot, [-1, num_steps, input_size])
    time_major = tf.transpose(batched, [1, 0, 2])
    step_inputs = tf.split(
        0, num_steps, tf.reshape(time_major, [-1, input_size]))

    step_outputs, _ = tf.nn.rnn(stacked_cell, step_inputs, dtype=tf.float32)
    features = tf.reshape(tf.concat(1, step_outputs),
                          [-1, final_hidden_size])

    # Project the last hidden layer to one logit per skill.
    sigmoid_w = tf.get_variable("sigmoid_w",
                                [final_hidden_size, num_skills])
    sigmoid_b = tf.get_variable("sigmoid_b", [num_skills])
    flat_logits = tf.reshape(tf.matmul(features, sigmoid_w) + sigmoid_b,
                             [-1])

    # Pick, from all flattened logits, the ones being evaluated.
    selected_logits = tf.gather(flat_logits, self.target_id)

    self._pred = self._pred_values = tf.sigmoid(selected_logits)

    # Summed (not averaged) cross entropy over the selected logits.
    self._cost = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(selected_logits,
                                                target_correctness))
def parse_tfrecord_function(example_proto):
    """Parse one serialized example into target and profile feature tensors.

    Args:
      example_proto: Serialized example to parse.

    Returns:
      A 16-tuple: (target, targets, gender, age, location,
      education_schools, education_degrees, education_starts,
      education_majors, work_expr_orgs, work_expr_starts,
      work_expr_durations, work_expr_jobs, work_expr_orgIds,
      work_expr_descs, proj_expr_descs). `targets` is a length-1930 vector
      holding 0.9 at index `target` and 0.1 / 1929 elsewhere; the two
      description features are reshaped to [3, 120]; the other sequence
      features get static shape [3].
    """
    total_tags = 1930
    on_value = 0.9
    # Spread the remaining 0.1 evenly over all the other tags.
    off_value = 0.1 / (total_tags - 1)

    def scalar_int():
        return tf.FixedLenFeature([], tf.int64, default_value=0)

    def sequence_of(dtype):
        return tf.FixedLenSequenceFeature([],
                                          dtype,
                                          allow_missing=True,
                                          default_value=0)

    spec = {
        "target": scalar_int(),
        "target_orgId": scalar_int(),
        "gender": scalar_int(),
        "age": scalar_int(),
        "location": scalar_int(),
        "education_schools": sequence_of(tf.int64),
        "education_degrees": sequence_of(tf.int64),
        "education_starts": sequence_of(tf.float32),
        "education_majors": sequence_of(tf.int64),
        "work_expr_descs": sequence_of(tf.int64),
        "work_expr_orgs": sequence_of(tf.int64),
        "work_expr_orgIds": sequence_of(tf.int64),
        "work_expr_starts": sequence_of(tf.float32),
        "work_expr_durations": sequence_of(tf.float32),
        "work_expr_jobs": sequence_of(tf.int64),
        "proj_expr_descs": sequence_of(tf.int64),
    }
    parsed = tf.parse_single_example(example_proto, spec)

    target = parsed["target"]
    targets = tf.sparse_to_dense(target, [total_tags], on_value, off_value)

    # Education and work-experience sequences each hold 3 entries.
    triple_keys = (
        "education_schools",
        "education_degrees",
        "education_starts",
        "education_majors",
        "work_expr_orgs",
        "work_expr_starts",
        "work_expr_durations",
        "work_expr_orgIds",
        "work_expr_jobs",
    )
    for key in triple_keys:
        parsed[key].set_shape([3])

    # Descriptions arrive flat (360 ids) and are split into 3 rows of 120.
    flat_work_descs = parsed["work_expr_descs"]
    flat_work_descs.set_shape([360])
    work_expr_descs = tf.reshape(flat_work_descs, [3, 120])

    flat_proj_descs = parsed["proj_expr_descs"]
    flat_proj_descs.set_shape([360])
    proj_expr_descs = tf.reshape(flat_proj_descs, [3, 120])

    return (target, targets, parsed["gender"], parsed["age"],
            parsed["location"], parsed["education_schools"],
            parsed["education_degrees"], parsed["education_starts"],
            parsed["education_majors"], parsed["work_expr_orgs"],
            parsed["work_expr_starts"], parsed["work_expr_durations"],
            parsed["work_expr_jobs"], parsed["work_expr_orgIds"],
            work_expr_descs, proj_expr_descs)
def build_input(flags, mode):
    """Build a queue-based pipeline over fixed-length binary image records.

    Each record is one label byte followed by ``image_size * image_size * 3``
    pixel bytes stored depth-major.

    Args:
        flags: Object providing ``image_size``, ``batch_size``, ``num_label``,
            ``train_filepath`` and ``valid_filepath``.
        mode: ``'train'`` selects the training files and random augmentation;
            any other value selects the validation files.

    Returns:
        ``(images, labels)``: a float32 batch of shape
        ``[batch_size, image_size, image_size, 3]`` and one-hot labels of
        shape ``[batch_size, num_label]``.
    """
    side = flags.image_size
    batch = flags.batch_size
    n_classes = flags.num_label
    training = mode == 'train'
    pattern = flags.train_filepath if training else flags.valid_filepath

    label_bytes = 1
    label_offset = 0
    depth = 3
    image_bytes = side * side * depth
    record_bytes = label_bytes + label_offset + image_bytes

    filenames = tf.gfile.Glob(pattern)
    file_queue = tf.train.string_input_producer(filenames, shuffle=True)

    # Pull one raw record at a time from the filename queue.
    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
    _, raw = reader.read(file_queue)

    # Split the record into its label byte and its pixel bytes.
    record = tf.reshape(tf.decode_raw(raw, tf.uint8), [record_bytes])
    label_part = tf.slice(record, [label_offset], [label_bytes])
    label = tf.cast(label_part, tf.int32)
    pixel_part = tf.slice(record, [label_offset + label_bytes], [image_bytes])
    # Stored as [depth, height, width]; the model wants [height, width, depth].
    depth_major = tf.reshape(pixel_part, [depth, side, side])
    image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)

    queue_dtypes = [tf.float32, tf.int32]
    queue_shapes = [[side, side, depth], [1]]
    if training:
        # Pad by 4, take a random crop back to size, flip at random.
        # Brightness/saturation/contrast jitter is intentionally left out: it
        # was noted to give only .2%~.5% gains on cifar.
        image = tf.image.resize_image_with_crop_or_pad(
            image, side + 4, side + 4)
        image = tf.random_crop(image, [side, side, 3])
        image = tf.image.random_flip_left_right(image)
        image = tf.image.per_image_standardization(image)
        example_queue = tf.RandomShuffleQueue(
            capacity=16 * batch,
            min_after_dequeue=8 * batch,
            dtypes=queue_dtypes,
            shapes=queue_shapes)
        num_threads = 16
    else:
        image = tf.image.resize_image_with_crop_or_pad(image, side, side)
        image = tf.image.per_image_standardization(image)
        example_queue = tf.FIFOQueue(
            3 * batch,
            dtypes=queue_dtypes,
            shapes=queue_shapes)
        num_threads = 1

    enqueue_op = example_queue.enqueue([image, label])
    runner = tf.train.queue_runner.QueueRunner(
        example_queue, [enqueue_op] * num_threads)
    tf.train.add_queue_runner(runner)

    # Dequeue a whole batch and turn the integer labels into one-hot rows.
    images, labels = example_queue.dequeue_many(batch)
    labels = tf.reshape(labels, [batch, 1])
    row_ids = tf.reshape(tf.range(0, batch, 1), [batch, 1])
    labels = tf.sparse_to_dense(
        tf.concat(values=[row_ids, labels], axis=1),
        [batch, n_classes], 1.0, 0.0)

    # Sanity checks on the static shapes.
    image_shape = images.get_shape()
    label_shape = labels.get_shape()
    assert len(image_shape) == 4
    assert image_shape[0] == batch
    assert image_shape[-1] == 3
    assert len(label_shape) == 2
    assert label_shape[0] == batch
    assert label_shape[1] == n_classes

    # Show the batch in the TensorBoard image visualizer.
    tf.summary.image('images', images)
    return images, labels
def build_input(data_path, batch_size, num_class, reszie, mode='train'):
    """Build a shuffled batch pipeline over TFRecord files of JPEG images.

    Args:
        data_path: File pattern passed to ``tf.train.match_filenames_once``.
        batch_size: Number of examples per batch.
        num_class: Width of the one-hot label rows.
        reszie: Target height and width; each image is centre-cropped or
            zero-padded to ``reszie x reszie``. (The misspelled name is kept
            so keyword callers keep working.)
        mode: Accepted for interface compatibility; not used here.

    Returns:
        ``(images, labels)``: the image batch produced by
        ``tf.train.shuffle_batch`` and one-hot labels of shape
        ``[batch_size, num_class]``.
    """
    # Minimum number of examples kept in the shuffle queue after a dequeue.
    min_after_dequeue = 500

    # Collect the files matching the pattern and queue them up, shuffled.
    files = tf.train.match_filenames_once(data_path)
    filename_queue = tf.train.string_input_producer(files, shuffle=True)

    # Read one serialized example at a time.
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    # Every listed feature is required (no defaults), including those whose
    # values are not used below.
    features = tf.parse_single_example(
        serialized_example,
        features={
            "image/encoded": tf.FixedLenFeature([], tf.string),
            "image/height": tf.FixedLenFeature([], tf.int64),
            "image/width": tf.FixedLenFeature([], tf.int64),
            "image/filename": tf.FixedLenFeature([], tf.string),
            "image/class/label": tf.FixedLenFeature([], tf.int64),
            'image/channels': tf.FixedLenFeature([], tf.int64),
        })

    # Maximum number of examples the shuffle queue may hold.
    capacity = min_after_dequeue + 3 * batch_size

    image_raw = tf.image.decode_jpeg(features['image/encoded'], channels=3)
    label = tf.cast(features['image/class/label'], tf.int32)
    image_resize = tf.image.resize_image_with_crop_or_pad(
        image_raw, reszie, reszie)

    images, labels = tf.train.shuffle_batch(
        [image_resize, label],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)

    # Turn the integer labels into one-hot rows via (row, class) indices.
    labels = tf.reshape(labels, [batch_size, 1])
    indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
    labels = tf.sparse_to_dense(tf.concat(values=[indices, labels], axis=1),
                                [batch_size, num_class], 1.0, 0.0)

    tf.summary.image('images', images)
    return images, labels
def build_model(self):
    """Build the training graph of the two-LSTM video captioner.

    The video features are first run through both LSTMs (encoding stage),
    collecting the first LSTM's state at every step. The decoding stage then
    emits one word per step; the second LSTM is fed the previous word's
    embedding, the first LSTM's output, and an attention context computed
    over the collected encoder states.

    Returns:
        ``(loss, video, video_mask, caption, caption_mask, probs)`` where
        ``loss`` is the sum over decoding steps of the masked, batch-averaged
        cross-entropy, the four middle items are the input placeholders, and
        ``probs`` is the list of per-step word logits.
    """
    video = tf.placeholder(
        tf.float32,
        [self.batch_size, self.n_video_lstm_step,
         self.dim_image])  # (batch, 80, 4096)
    video_mask = tf.placeholder(tf.float32,
                                [self.batch_size, self.n_video_lstm_step])

    # Includes <BOS>; stores word ids; (batch, max_length + 1).
    caption = tf.placeholder(
        tf.int32, [self.batch_size, self.n_caption_lstm_step + 1])
    caption_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_caption_lstm_step + 1])

    video_flat = tf.reshape(video, [-1, self.dim_image])
    image_emb = tf.nn.xw_plus_b(
        video_flat, self.encode_image_W,
        self.encode_image_b)  # (batch_size * n_lstm_steps, dim_hidden)
    image_emb = tf.reshape(
        image_emb, [self.batch_size, self.n_lstm_steps, self.dim_hidden])

    print("lstm1 sate size,", self.lstm1.state_size)
    print("lstm2 sate size,", self.lstm2.state_size)  # 2*hidden size

    state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
    state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
    padding = tf.zeros([self.batch_size, self.dim_hidden])  # (batch, 1000)

    probs = []
    loss = 0.0

    # ---------------------------- Encoding stage ----------------------------
    # No attention context exists yet, so the second LSTM gets zeros for it.
    context_padding = tf.zeros([self.batch_size, self.lstm2.state_size])
    h_list = []
    for i in range(0, self.n_video_lstm_step):
        with tf.variable_scope("LSTM1", reuse=(i != 0)):
            output1, state1 = self.lstm1(image_emb[:, i, :], state1)
            h_list.append(state1)

        with tf.variable_scope("LSTM2", reuse=(i != 0)):
            output2, state2 = self.lstm2(
                tf.concat([padding, output1, context_padding], 1), state2)

    print(np.shape(h_list))
    h_list = tf.stack(h_list, axis=1)
    print(np.shape(h_list))  # (64, 80, 2000)

    # The projection of the encoder states through attention_W does not
    # depend on the decoding step, so it is computed once here.
    h_list_flat = tf.reshape(h_list, [-1, self.lstm1.state_size])
    htmp = tf.matmul(h_list_flat, self.attention_W)
    hW = tf.reshape(htmp, [
        self.batch_size, self.n_video_lstm_step, self.lstm2.state_size
    ])

    # ---------------------------- Decoding stage ----------------------------
    for i in range(0, self.n_caption_lstm_step):
        if i == 0:
            with tf.device("/cpu:0"):
                current_embed = tf.nn.embedding_lookup(
                    self.Wemb, caption[:, i])
        else:
            # Scheduled sampling: with probability schedule_p feed the
            # ground-truth word, otherwise feed the model's own prediction.
            # NOTE: the coin is flipped with NumPy while the graph is being
            # built, so the choice per step is fixed for this graph rather
            # than re-drawn on every training step.
            print(self.schedule_p)
            if np.random.binomial(1, self.schedule_p) == 1:
                with tf.device("/cpu:0"):
                    current_embed = tf.nn.embedding_lookup(
                        self.Wemb, caption[:, i])
            else:
                # Keep the full (batch,) vector of argmax ids: a scalar id
                # would give a rank-1 embedding that cannot be concatenated
                # with the rank-2 output1 / context tensors below.
                max_prob_index = tf.argmax(logit_words, 1)
                with tf.device("/cpu:0"):
                    current_embed = tf.nn.embedding_lookup(
                        self.Wemb, max_prob_index)

        with tf.variable_scope("LSTM1", reuse=True):
            output1, state1 = self.lstm1(padding, state1)

        # ------------------------------ attention ---------------------------
        # The query is attention_z on the first step and the second LSTM's
        # previous state afterwards.
        context = []
        if i == 0:
            new_z = self.attention_z

        for x in range(0, self.batch_size):
            # Score each encoder step against the query, normalise with a
            # softmax, and take the weighted sum of the encoder states.
            x_alpha = tf.reduce_sum(tf.multiply(hW[x, :, :], new_z[x, :]),
                                    axis=1)
            x_alpha = tf.nn.softmax(x_alpha)
            x_alpha = tf.expand_dims(x_alpha, 1)
            x_new_z = tf.reduce_sum(tf.multiply(x_alpha, h_list[x, :, :]),
                                    axis=0)
            context.append(x_new_z)
        context = tf.stack(context)
        print("context shape", context.shape)

        with tf.variable_scope("LSTM2", reuse=True):
            print(output1.shape)  # (64,1000)
            output2, state2 = self.lstm2(
                tf.concat([current_embed, output1, context], 1), state2)
            new_z = state2

        # One-hot encode the next ground-truth word via (row, word) indices.
        labels = tf.expand_dims(caption[:, i + 1], 1)
        indices = tf.expand_dims(tf.range(0, self.batch_size, 1),
                                 1)  # (batch_size, 1)
        concated = tf.concat([indices, labels], 1)
        onehot_labels = tf.sparse_to_dense(
            concated, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)

        # Unnormalised score of each vocabulary word.
        logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W,
                                      self.embed_word_b)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logit_words, labels=onehot_labels)
        cross_entropy = cross_entropy * caption_mask[:, i]
        probs.append(logit_words)

        current_loss = tf.reduce_sum(cross_entropy) / self.batch_size
        loss = loss + current_loss

    return loss, video, video_mask, caption, caption_mask, probs
def __init__(self, encoders, vocabulary, data_id, rnn_size, name,
             embedding_size=128, use_attention=None, max_output_len=20,
             scheduled_sampling=None, dropout_keep_p=0.5, copy_net=None,
             reused_word_embeddings=None, use_noisy_activations=False,
             depth=1):
    """Build the part of the computation graph that is needed for decoding.

    TensorBoard summaries created here are added to the 'summary_train'
    collection (train-time statistics). No validation-time collection is
    populated in this constructor.

    Arguments:
        encoders: List of encoders. If no encoder is provided, the decoder
            can be used to train a LM.
        vocabulary: Vocabulary used for decoding.
        data_id: Stored as ``self.data_id``; not otherwise used here.
        rnn_size: Size of the RNN state.
        name: Name of the decoder; logged and stored as ``self.name``.
        embedding_size (int): Dimensionality of the word embeddings used
            during decoding.
        use_attention (str): The type of attention to use or None. (Refer
            to cli_options script for allowed types of attention.) In this
            constructor it only acts as a truthiness switch for collecting
            the encoders' attention objects.
        max_output_len (int): Maximum length of the decoder output.
        scheduled_sampling: Parameter k for inverse sigmoid decay in
            scheduled sampling. If falsy, no loop function is used with
            the ground-truth inputs.
        dropout_keep_p: Stored as ``self.dropout_keep_p``; the graph itself
            reads the keep probability from ``self.dropout_placeholder``.
        copy_net: Tuple of (i) list of indices to the target vocabulary
            (most likely input placeholders of a different encoder),
            (ii) the tensor over which the copying will be done, and
            (iii) mask telling which words are part of the input.
        reused_word_embeddings: The decoder can be given the matrix of word
            embeddings from outside (if the vocabulary indexing is the
            same). If it is None, the decoder creates its own matrix of
            word embeddings.
        use_noisy_activations: If set to True, the decoder will use the GRU
            units with noisy activation.
        depth: Number of stacked RNN cells; also multiplies the size of the
            encoder projection (``depth * rnn_size``).

    Attributes:
        gt_inputs: List of placeholders for the decoder inputs. The i-th
            element of the list contains a batch of i-th symbols in the
            sequence.
        weights_ins: List of placeholders of particular output symbols
            weights. The i-th element of the list contains a vector telling
            for each string of the batch how much the i-th word should
            contribute to the loss computation. In practice it contains 1's
            for words which are parts of the decoded strings and 0's for
            the padding.
        loss_with_gt_ins: Operator computing the sequence loss when the
            decoder always gets the ground truth input.
        loss_with_decoded_ins: Operator computing the sequence loss when
            the decoder receives previously computed outputs on its input.
        decoded_seq: List of batches of decoded words. (When the decoder is
            fed with its own outputs.)
    """
    log("Initializing decoder, name: \"{}\"".format(name))
    self.encoders = encoders
    assert_type(self, 'vocabulary', vocabulary, Vocabulary)
    self.vocabulary = vocabulary
    self.data_id = data_id
    self.rnn_size = rnn_size
    self.embedding_size = embedding_size
    self.use_attention = use_attention
    self.max_output_len = max_output_len
    self.scheduled_sampling = scheduled_sampling
    self.dropout_keep_p = dropout_keep_p
    self.copy_net = copy_net
    self.reused_word_embeddings = reused_word_embeddings
    self.use_noisy_activations = use_noisy_activations
    self.depth = depth
    self.name = name

    self.dropout_placeholder = tf.placeholder(tf.float32,
                                              name="decoder_dropout_plc")
    self.is_training = tf.placeholder(tf.bool, name="decoder_is_training")
    self.learning_step = tf.Variable(0, name="learning_step",
                                     trainable=False)

    # The following piece branches on the number of encoders:
    # with a single encoder we take its encoded state directly,
    # with several we project them first.
    # It would be better to add an optional attribute encoder_projection
    # and give it rnn_size as its value, which would from then on be
    # inferred automatically, i.e. without a projection the output states of
    # all encoders are concatenated and the length of the resulting state
    # becomes rnn_size.
    # NOTE(review): the no-projection branch yields size rnn_size while the
    # projection yields depth * rnn_size - confirm this is intended when
    # depth > 1.
    if len(encoders) == 1 and (
            rnn_size == encoders[0].encoded.get_shape()[1].value):
        encoded = encoders[0].encoded
        log("Using encoder output without projection.")
    elif len(encoders) >= 1:
        with tf.variable_scope("encoders_projection"):
            encoded_concat = tf.concat(1, [e.encoded for e in encoders])
            concat_size = encoded_concat.get_shape()[1].value
            proj = tf.get_variable(name="project_encoders",
                                   shape=[concat_size, depth * rnn_size])
            encoded_concat_dropped = tf.nn.dropout(
                encoded_concat, self.dropout_placeholder)
            proj_bias = tf.Variable(tf.zeros([depth * rnn_size]))
            encoded = tf.matmul(encoded_concat_dropped, proj) + proj_bias
    elif len(encoders) == 0:
        # if we want to train just LM
        encoded = tf.zeros([rnn_size])
        log("No encoder - language model only.")

    # TODO: the open question is whether this happens in the right order
    # (self.encoded keeps the state before dropout is applied).
    self.encoded = encoded
    encoded = tf.nn.dropout(encoded, self.dropout_placeholder)

    # The piece below declares the placeholders for the decoder inputs.
    # The placeholders are put into the collection 'dec_encoder_ins', which
    # is probably not used anywhere.
    # self.targets is self.gt_inputs shifted by one to the left.
    self.gt_inputs = []
    with tf.variable_scope("decoder_inputs"):
        for i in range(max_output_len + 2):
            dec = tf.placeholder(tf.int64, [None],
                                 name='decoder{0}'.format(i))
            tf.add_to_collection('dec_encoder_ins', dec)
            self.gt_inputs.append(dec)
    self.targets = self.gt_inputs[1:]

    # This piece defines the weights on the inputs. There are as many of
    # them as targets, and they most likely contain only ones and zeros
    # depending on whether we are already past the end of the sentence.
    # This should be thrown away entirely when moving to dynamic rnn.
    self.weights_ins = []
    with tf.variable_scope("input_weights"):
        for _ in range(len(self.targets)):
            self.weights_ins.append(tf.placeholder(tf.float32, [None]))

    # The code of the decoder itself follows, in its own scope.
    # Why the things above live in separate scopes of their own is unclear.
    with tf.variable_scope('decoder'):
        # Declare the weight and bias variables for the transition from the
        # state to the output layer.
        # Why is get_variable not used here? It uses the uniform unit
        # scaling initializer, which at the very least has a cooler name.
        decoding_w = tf.Variable(tf.random_uniform(
            [rnn_size, len(vocabulary)], -0.5, 0.5),
            name="state_to_word_W")
        decoding_b = tf.Variable(tf.fill([len(vocabulary)],
                                         -math.log(len(vocabulary))),
                                 name="state_to_word_b")

        # If we do not use shared embeddings, we create our own. They map
        # from the vocabulary to the vector fed to the decoder input at each
        # time step. Sharing embeddings requires them to have the same size.
        if reused_word_embeddings is None:
            decoding_em = tf.Variable(tf.random_uniform(
                [len(vocabulary), embedding_size], -0.5, 0.5),
                name="word_embeddings")
        else:
            decoding_em = reused_word_embeddings.word_embeddings

        # Create the embedded ground-truth inputs and apply dropout to them;
        # they are used during training.
        embedded_gt_inputs = [
            tf.nn.embedding_lookup(decoding_em, o)
            for o in self.gt_inputs[:-1]
        ]
        embedded_gt_inputs = [
            tf.nn.dropout(i, self.dropout_placeholder)
            for i in embedded_gt_inputs
        ]

        # Define a function that returns the logits for a given state.
        # This will have to be reworked; the copy-net lives here as well.
        # The logits are the dropped-out state multiplied by the output
        # weight matrix, plus the biases. The second returned item is the
        # copy-net logits slot, None for the standard function.
        def standard_logits(state):
            state = tf.nn.dropout(state, self.dropout_placeholder)
            return tf.matmul(state, decoding_w) + decoding_b, None

        logit_function = standard_logits

        # COPY NET
        # (not being looked into for now)
        if copy_net:
            # This is implementation of Copy-net
            # (http://arxiv.org/pdf/1603.06393v2.pdf)
            encoder_input_indices, copy_states, copy_mask = copy_net
            copy_tensor_dropped = tf.nn.dropout(copy_states,
                                                self.dropout_placeholder)
            copy_tensors = [
                tf.squeeze(t, [1])
                for t in tf.split(1, max_output_len + 2,
                                  copy_tensor_dropped)
            ]
            copy_features_size = copy_states.get_shape()[2].value

            # first we do the learned projection of the encoder outputs
            copy_w = tf.get_variable(name="copy_W",
                                     shape=[copy_features_size, rnn_size])
            projected_inputs = tf.concat(1, [
                tf.expand_dims(tf.matmul(c, copy_w), 1)
                for c in copy_tensors
            ])
            batch_size = tf.shape(encoder_input_indices[0])[0]

            # tensor of batch numbers for indexing in a sparse vector
            batch_range = tf.range(start=0, limit=batch_size)
            batch_time_vocabulary_shape = tf.concat(
                0, [
                    tf.expand_dims(batch_size, 0),
                    tf.constant(len(vocabulary), shape=[1])
                ])
            ones = tf.ones(tf.expand_dims(batch_size, 0))

            # For every source position build a (batch x vocabulary) matrix
            # with a one at the vocabulary index of that position's word.
            vocabulary_shaped_list = []
            for slice_indices in encoder_input_indices:
                complete_indices = tf.concat(1, [
                    tf.expand_dims(batch_range, 1),
                    tf.expand_dims(slice_indices, 1)
                ])
                vocabulary_shaped = tf.sparse_to_dense(
                    complete_indices, batch_time_vocabulary_shape, ones)
                vocabulary_shaped_list.append(vocabulary_shaped)
            vocabulary_shaped_indices = tf.concat(
                1, [tf.expand_dims(v, 1) for v in vocabulary_shaped_list])

            def copy_net_logit_function(state):
                state = tf.nn.dropout(state, self.dropout_placeholder)

                # the logits for generating the next word are computed in
                # the standard way
                generate_logits = tf.matmul(state, decoding_w) + decoding_b

                # Equation 8 in the paper ... in shape of source sentence
                # (batch x time)
                copy_logits_in_time = tf.reduce_sum(
                    projected_inputs * tf.expand_dims(state, 1), [2])

                # mask out the padding in exponential domain
                # (values are capped at 80.0 before exponentiation)
                copy_logits_in_time_exp_masked = tf.exp(
                    tf.minimum([[80.0]], copy_logits_in_time)) * copy_mask

                # ... in shape of vocabulary (batch x time x vocabulary)
                copy_logits_in_vocabulary = tf.expand_dims(
                    copy_logits_in_time_exp_masked,
                    2) * vocabulary_shaped_indices

                # Equation 6 without normalization
                copy_logits_exp = tf.reduce_sum(copy_logits_in_vocabulary,
                                                [1])
                logits_exp = copy_logits_exp \
                    + tf.exp(tf.minimum([[80.0]], generate_logits))

                # Floor at 1e-40 before the log so it never sees zero.
                return (tf.log(tf.maximum([[1e-40]], logits_exp)),
                        copy_logits_in_time)

            logit_function = copy_net_logit_function
        # END OF COPY-NET

        # Below are two loop functions. A loop function is a function used
        # at run time. It takes the state and the number of the time step
        # and returns the input for the next step, after embedding and
        # dropout.
        def loop(prev_state, _):
            # it takes the previous hidden state, finds the word and formats
            # it as input for the next time step ... used in the decoder in
            # the "real decoding scenario"
            out_activation, _ = logit_function(prev_state)
            prev_word_index = tf.argmax(out_activation, 1)
            next_step_embedding = tf.nn.embedding_lookup(
                decoding_em, prev_word_index)
            return tf.nn.dropout(next_step_embedding,
                                 self.dropout_placeholder)

        # This loop function is for scheduled sampling. Scheduled sampling
        # trains on the gold data first and over time gradually switches to
        # the loop function. This particular one draws its random condition
        # with the shape of an embedded input batch.
        def sampling_loop(prev_state, i):
            """
            Loop function performing the scheduled sampling
            (http://arxiv.org/pdf/1506.03099v3.pdf) with the inverse
            sigmoid decay.
            """
            threshold = scheduled_sampling / (scheduled_sampling + tf.exp(
                tf.to_float(self.learning_step) / scheduled_sampling))
            condition = tf.less_equal(
                tf.random_uniform(tf.shape(embedded_gt_inputs[0])),
                threshold)
            return tf.select(condition, embedded_gt_inputs[i],
                             loop(prev_state, i))

        gt_loop_function = sampling_loop if scheduled_sampling else None

        # This function lingers here so that we can pick the cell that is
        # used as the RNN cell. For one thing the noisy activations did not
        # help, and for another the whole thing is a candidate for removal.
        # Below it is the code that wraps the cells in dropout and turns
        # them into a MultiRNNCell (in case we wanted a deeper recurrent
        # part).
        def get_rnn_cell():
            if use_noisy_activations:
                return NoisyGRUCell(rnn_size, training=self.is_training)
            else:
                return tf.nn.rnn_cell.GRUCell(rnn_size)

        decoder_cells = [get_rnn_cell()]
        for _ in range(1, depth):
            # Dropout goes on the output of every cell except the last one.
            decoder_cells[-1] = tf.nn.rnn_cell.DropoutWrapper(
                decoder_cells[-1],
                output_keep_prob=self.dropout_placeholder)
            decoder_cells.append(get_rnn_cell())
        decoder_cell = tf.nn.rnn_cell.MultiRNNCell(decoder_cells)

        # Now comes attention. We just look at the encoders to see whether
        # they have it defined or not.
        if use_attention:
            attention_objects = [
                e.attention_object for e in encoders if e.attention_object
            ]
        else:
            attention_objects = []

        # And now the decoding procedure itself. This first one returns the
        # outputs when using the gold inputs (during training).
        rnn_outputs_gt_ins, _ = attention_decoder(
            embedded_gt_inputs, encoded, attention_objects, embedding_size,
            cell=decoder_cell, loop_function=gt_loop_function)

        # The run-time decoder below shares its variables with the one above.
        tf.get_variable_scope().reuse_variables()

        # Below is the decoding procedure for run time, i.e. using the loop
        # function.
        # Why is this a placeholder? Why is it not a constant?
        self.go_symbols = tf.placeholder(tf.int32, shape=[None],
                                         name="decoder_go_symbols")
        decoder_inputs = [
            tf.nn.embedding_lookup(decoding_em, self.go_symbols)
        ]
        decoder_inputs += [None for _ in range(self.max_output_len)]
        rnn_outputs_decoded_ins, _ = attention_decoder(
            decoder_inputs, encoded, attention_objects, embedding_size,
            cell=decoder_cell, loop_function=loop)
        self.hidden_states = rnn_outputs_decoded_ins
    # END of decoder scope

    def get_decoded(rnn_outputs):
        # Turn a list of RNN outputs into decoded word indices, logits and
        # (when copy_net is set) the copy-net logits over the source.
        logits = []
        decoded = []
        copynet_logits = []
        for out in rnn_outputs:
            out_activation, logits_in_time = logit_function(out)
            if copy_net:
                copynet_logits.append(logits_in_time)
            logits.append(out_activation)
            # The argmax skips column 0 and shifts the result back by one,
            # so vocabulary index 0 is never decoded.
            decoded.append(tf.argmax(out_activation[:, 1:], 1) + 1)
        return decoded, logits, copynet_logits

    # Decoding and loss with the ground truth (during training).
    _, self.gt_logits, _ = get_decoded(rnn_outputs_gt_ins)
    self.loss_with_gt_ins = tf.nn.seq2seq.sequence_loss(
        self.gt_logits, self.targets, self.weights_ins, len(vocabulary))
    self.cost = self.loss_with_gt_ins

    # Decoding and loss with the loop function (run time).
    self.decoded_seq, self.decoded_logits, self.copynet_logits = \
        get_decoded(rnn_outputs_decoded_ins)
    self.loss_with_decoded_ins = tf.nn.seq2seq.sequence_loss(
        self.decoded_logits, self.targets, self.weights_ins,
        len(vocabulary))

    # Below are the summaries: everything that gets logged to TensorBoard.
    tf.scalar_summary('train_loss_with_gt_intpus', self.loss_with_gt_ins,
                      collections=["summary_train"])
    tf.scalar_summary('train_loss_with_decoded_inputs',
                      self.loss_with_decoded_ins,
                      collections=["summary_train"])
    tf.scalar_summary('train_optimization_cost', self.cost,
                      collections=["summary_train"])

    log("Decoder initalized.")
def get_dense_x(index, value):
    """Scatter sparse feature values into a dense column vector.

    ``index`` and ``value`` are each densified with
    ``tf.sparse_tensor_to_dense`` (so both are presumably ``SparseTensor``
    objects - confirm against the caller). The densified values are written
    at the densified indices of a length-``num_features`` vector, where
    ``num_features`` comes from the enclosing scope.

    Returns:
        A tensor of shape ``[num_features, 1]``.
    """
    positions = tf.sparse_tensor_to_dense(index)
    entries = tf.sparse_tensor_to_dense(value)
    flat = tf.sparse_to_dense(positions, [num_features], entries)
    column_shape = [num_features, 1]
    return tf.reshape(flat, column_shape)
def build_net(self, is_training=True):
    """Build the recognition graph: base net, residual BiLSTM stack, CTC.

    Args:
        is_training: When True, inputs come from the train/valid TFRecords
            (switched by the ``self.train_stage`` bool placeholder) and the
            loss, optimizer, edit-distance metric and summaries are built.
            When False, ``self.x`` is a plain image placeholder and only
            the prediction path is built.

    Sets ``self.x``, ``self.logits`` and ``self.preds``; in training mode
    also ``self.train_stage``, ``self.label``, ``self.label_len``,
    ``self.loss``, ``self.global_step``, ``self.train_op``,
    ``self.edit_dist``, ``self.char_count`` and ``self.merged_summary_op``.
    """
    with self.graph.as_default():
        if is_training:
            self.train_stage = tf.placeholder(tf.bool, shape=())
            train_image, train_label, train_label_len = self.load_tfrecord(
                config.train_tfrecord)
            valid_image, valid_label, valid_label_len = self.load_tfrecord(
                config.valid_tfrecord)
            self.x = tf.cond(self.train_stage, lambda: train_image,
                             lambda: valid_image)
            self.label = tf.cond(self.train_stage, lambda: train_label,
                                 lambda: valid_label)
            self.label_len = tf.cond(self.train_stage,
                                     lambda: train_label_len,
                                     lambda: valid_label_len)
        else:
            self.x = tf.placeholder(tf.float32,
                                    shape=(None, config.image_height,
                                           config.image_max_width, 1),
                                    name='image_batch')

        enc = self.base_net(is_training)
        print('enc1:', enc)
        # Flatten the two spatial axes into one time axis.
        tshape = enc.get_shape().as_list()
        final_width = tshape[1] * tshape[2]
        enc = tf.reshape(enc, [-1, final_width, config.rnn_units])
        print('enc2:', enc)

        # Integer division: each direction gets half of the units so that
        # the concatenated output is rnn_units wide again. ("/" would hand
        # a float to LSTMCell under Python 3.)
        half_units = config.rnn_units // 2
        for i in range(config.rnn_layers_num):
            # Linear shortcut added to the BiLSTM output (residual link).
            _enc = tf.layers.dense(enc, config.rnn_units, use_bias=False)
            with tf.variable_scope("rnn_layer_{}".format(i)):
                cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=half_units,
                                                  state_is_tuple=True)
                cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=half_units,
                                                  state_is_tuple=True)
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=enc,
                    dtype=tf.float32,
                    time_major=False)
                enc = _enc + tf.concat(values=[outputs[0], outputs[1]],
                                       axis=-1)

        if is_training:
            # tf.layers.dropout defaults to training=False, which makes it
            # an identity op. Drive it with the train/valid switch so it is
            # active on training batches and off on validation batches.
            enc = tf.layers.dropout(enc, 0.5, training=self.train_stage)

        # One extra class for the CTC blank.
        self.logits = tf.layers.dense(enc, len(self.idx2symbol) + 1)
        print('last logit shape', self.logits)
        logit_shape = self.logits.get_shape().as_list()
        time_major_logits = tf.transpose(
            self.logits, [1, 0, 2])  # max_time * batch_size * num_classes

        # Every sequence is treated as spanning the full time axis.
        seq_len = tf.fill([config.batch_size], logit_shape[1])
        print('seq:', seq_len)
        greedy_preds = tf.nn.ctc_greedy_decoder(time_major_logits, seq_len)
        preds_sparse = tf.cast(greedy_preds[0][0], tf.int32)
        self.preds = tf.sparse_to_dense(preds_sparse.indices,
                                        preds_sparse.dense_shape,
                                        preds_sparse.values,
                                        name='pred')
        print('preds:', self.preds)

        if is_training:
            # Convert the dense label matrix to a SparseTensor; entries
            # that are not greater than 0 are dropped.
            batch_label_length = config.label_max_len
            spare_tensor_indices = tf.where(
                tf.less(tf.cast(0, tf.int32),
                        self.label))  # indices of entries greater than 0
            print('label shape', self.label)
            spare_tensor_values = tf.reshape(
                self.label, [config.batch_size * batch_label_length])
            mask = tf.greater(spare_tensor_values, 0)
            spare_tensor_values = tf.boolean_mask(spare_tensor_values, mask)
            labels_sparse = tf.SparseTensor(
                indices=spare_tensor_indices,
                values=spare_tensor_values,
                dense_shape=[config.batch_size, batch_label_length])

            loss = tf.nn.ctc_loss(labels=labels_sparse,
                                  inputs=self.logits,
                                  sequence_length=seq_len,
                                  time_major=False)
            self.loss = tf.reduce_mean(loss)
            self.global_step = tf.Variable(0, trainable=False)

            # Define the learning-rate schedule and the optimizer.
            lr = config.learning_rate
            rate = tf.train.exponential_decay(
                lr,
                self.global_step,
                decay_steps=config.decay_steps,
                decay_rate=0.97,
                staircase=True)
            opt = tf.train.AdamOptimizer(learning_rate=rate)
            # Run the ops collected under UPDATE_OPS before each step.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self.train_op = opt.minimize(self.loss,
                                             global_step=self.global_step)

            # Accuracy: summed (un-normalised) edit distance and the total
            # number of label characters.
            self.edit_dist = tf.reduce_sum(
                tf.edit_distance(preds_sparse, labels_sparse, False))
            self.char_count = tf.reduce_sum(self.label_len)

            tf.summary.scalar('loss', self.loss)
            self.merged_summary_op = tf.summary.merge_all()
def main():
    """Build the queue-based TensorFlow graph and run the mode in ``FLAGS.mode``.

    The function reads its whole configuration from the module-level
    ``FLAGS`` object, builds input pipelines for the training and validation
    files (``tfrecord`` or ``csv``), defines the model selected by
    ``FLAGS.model`` together with loss, accuracy and AUC ops, and then runs
    one of four modes inside a single session:

    * ``train``      - training loop with periodic validation/checkpointing;
                       the model is exported once the input queue is exhausted.
    * ``export``     - restore the latest checkpoint and export the model.
    * ``savedmodel`` - restore the latest checkpoint and write a SavedModel.
    * ``inference``  - restore the latest checkpoint, score a CSV file and
                       write the softmax outputs to a result file.

    The process exits with status 1 on an unknown input format, an unknown
    model name, or a missing checkpoint in the non-training modes.
    """
    # Get hyperparameters
    if FLAGS.enable_colored_log:
        import coloredlogs
        coloredlogs.install()
    logging.basicConfig(level=logging.INFO)
    INPUT_FILE_FORMAT = FLAGS.input_file_format
    if INPUT_FILE_FORMAT not in ["tfrecord", "csv"]:
        logging.error("Unknow input file format: {}".format(INPUT_FILE_FORMAT))
        exit(1)
    FEATURE_SIZE = FLAGS.feature_size
    LABEL_SIZE = FLAGS.label_size
    EPOCH_NUMBER = FLAGS.epoch_number
    if EPOCH_NUMBER <= 0:
        # A non-positive epoch count means "cycle through the data forever".
        EPOCH_NUMBER = None
    BATCH_THREAD_NUMBER = FLAGS.batch_thread_number
    MIN_AFTER_DEQUEUE = FLAGS.min_after_dequeue
    BATCH_CAPACITY = BATCH_THREAD_NUMBER * FLAGS.batch_size + MIN_AFTER_DEQUEUE
    MODE = FLAGS.mode
    MODEL = FLAGS.model
    CHECKPOINT_PATH = FLAGS.checkpoint_path
    # Paths starting with "fds://" are not on the local filesystem, so no
    # directory is created for them.
    if not CHECKPOINT_PATH.startswith("fds://") and not os.path.exists(
            CHECKPOINT_PATH):
        os.makedirs(CHECKPOINT_PATH)
    CHECKPOINT_FILE = CHECKPOINT_PATH + "/checkpoint.ckpt"
    LATEST_CHECKPOINT = tf.train.latest_checkpoint(CHECKPOINT_PATH)
    OUTPUT_PATH = FLAGS.output_path
    if not OUTPUT_PATH.startswith("fds://") and not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    pprint.PrettyPrinter().pprint(FLAGS.__flags)

    # Process TFRecords files
    def read_and_decode_tfrecord(filename_queue):
        """Read one serialized example and return ``(label, features)``."""
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        features = tf.parse_single_example(
            serialized_example,
            features={
                "label": tf.FixedLenFeature([], tf.float32),
                "features": tf.FixedLenFeature([FEATURE_SIZE], tf.float32),
            })
        label = features["label"]
        features = features["features"]
        return label, features

    def read_and_decode_csv(filename_queue):
        """Read one CSV line and return ``(label, features)``.

        The first column is the label, the remaining four are features.
        """
        # TODO: Not generic for all datasets
        reader = tf.TextLineReader()
        key, value = reader.read(filename_queue)
        # Default values, in case of empty columns. Also specifies the type
        # of the decoded result.
        record_defaults = [[1], [1.0], [1.0], [1.0], [1.0]]
        col1, col2, col3, col4, col5 = tf.decode_csv(
            value, record_defaults=record_defaults)
        label = col1
        # Fixed: the original stacked col4 twice and silently dropped col5.
        features = tf.stack([col2, col3, col4, col5])
        return label, features

    # Read TFRecords files for training
    filename_queue = tf.train.string_input_producer(
        tf.train.match_filenames_once(FLAGS.train_file),
        num_epochs=EPOCH_NUMBER)
    if INPUT_FILE_FORMAT == "tfrecord":
        label, features = read_and_decode_tfrecord(filename_queue)
    elif INPUT_FILE_FORMAT == "csv":
        label, features = read_and_decode_csv(filename_queue)
    batch_labels, batch_features = tf.train.shuffle_batch(
        [label, features],
        batch_size=FLAGS.batch_size,
        num_threads=BATCH_THREAD_NUMBER,
        capacity=BATCH_CAPACITY,
        min_after_dequeue=MIN_AFTER_DEQUEUE)

    # Read TFRecords file for validation
    validate_filename_queue = tf.train.string_input_producer(
        tf.train.match_filenames_once(FLAGS.validate_file),
        num_epochs=EPOCH_NUMBER)
    if INPUT_FILE_FORMAT == "tfrecord":
        validate_label, validate_features = read_and_decode_tfrecord(
            validate_filename_queue)
    elif INPUT_FILE_FORMAT == "csv":
        validate_label, validate_features = read_and_decode_csv(
            validate_filename_queue)
    validate_batch_labels, validate_batch_features = tf.train.shuffle_batch(
        [validate_label, validate_features],
        batch_size=FLAGS.validate_batch_size,
        num_threads=BATCH_THREAD_NUMBER,
        capacity=BATCH_CAPACITY,
        min_after_dequeue=MIN_AFTER_DEQUEUE)

    # Define the model
    input_units = FEATURE_SIZE
    output_units = LABEL_SIZE
    model_network_hidden_units = [int(i) for i in FLAGS.model_network.split()]

    def full_connect(inputs, weights_shape, biases_shape, is_train=True):
        """Affine layer; batch-normalized when enabled and ``is_train``."""
        weights = tf.get_variable("weights",
                                  weights_shape,
                                  initializer=tf.random_normal_initializer())
        biases = tf.get_variable("biases",
                                 biases_shape,
                                 initializer=tf.random_normal_initializer())
        layer = tf.matmul(inputs, weights) + biases
        if FLAGS.enable_bn and is_train:
            # Normalization uses the statistics of the current batch only.
            mean, var = tf.nn.moments(layer, axes=[0])
            scale = tf.get_variable("scale",
                                    biases_shape,
                                    initializer=tf.random_normal_initializer())
            shift = tf.get_variable("shift",
                                    biases_shape,
                                    initializer=tf.random_normal_initializer())
            layer = tf.nn.batch_normalization(layer, mean, var, shift, scale,
                                              FLAGS.bn_epsilon)
        return layer

    def full_connect_relu(inputs, weights_shape, biases_shape, is_train=True):
        """``full_connect`` followed by a ReLU."""
        layer = full_connect(inputs, weights_shape, biases_shape, is_train)
        layer = tf.nn.relu(layer)
        return layer

    def customized_inference(inputs, is_train=True):
        """Fixed 128-32-8 fully connected network with optional dropout."""
        hidden1_units = 128
        hidden2_units = 32
        hidden3_units = 8
        with tf.variable_scope("input"):
            layer = full_connect_relu(inputs, [input_units, hidden1_units],
                                      [hidden1_units], is_train)
        with tf.variable_scope("layer0"):
            layer = full_connect_relu(layer, [hidden1_units, hidden2_units],
                                      [hidden2_units], is_train)
        with tf.variable_scope("layer1"):
            layer = full_connect_relu(layer, [hidden2_units, hidden3_units],
                                      [hidden3_units], is_train)
        if FLAGS.enable_dropout and is_train:
            layer = tf.nn.dropout(layer, FLAGS.dropout_keep_prob)
        with tf.variable_scope("output"):
            layer = full_connect(layer, [hidden3_units, output_units],
                                 [output_units], is_train)
        return layer

    def dnn_inference(inputs, is_train=True):
        """Fully connected network sized by ``FLAGS.model_network``."""
        with tf.variable_scope("input"):
            layer = full_connect_relu(
                inputs, [input_units, model_network_hidden_units[0]],
                [model_network_hidden_units[0]], is_train)
        for i in range(len(model_network_hidden_units) - 1):
            with tf.variable_scope("layer{}".format(i)):
                layer = full_connect_relu(
                    layer,
                    [model_network_hidden_units[i],
                     model_network_hidden_units[i + 1]],
                    [model_network_hidden_units[i + 1]], is_train)
        with tf.variable_scope("output"):
            layer = full_connect(
                layer, [model_network_hidden_units[-1], output_units],
                [output_units], is_train)
        return layer

    def lr_inference(inputs, is_train=True):
        """Single affine layer (logistic regression logits)."""
        # NOTE(review): is_train is not forwarded here, so full_connect always
        # runs with its default is_train=True - confirm whether intentional.
        with tf.variable_scope("lr"):
            layer = full_connect(inputs, [input_units, output_units],
                                 [output_units])
        return layer

    def wide_and_deep_inference(inputs, is_train=True):
        """Sum of the logistic-regression and DNN logits."""
        return lr_inference(inputs, is_train) + dnn_inference(inputs, is_train)

    def cnn_inference(inputs, is_train=True):
        """Three conv/pool stages followed by a linear output layer."""
        # [BATCH_SIZE, 512 * 512 * 1] -> [BATCH_SIZE, 512, 512, 1]
        # The batch dimension is inferred (-1) so the same graph works for the
        # training batch, the validation batch and the serving placeholder.
        inputs = tf.reshape(inputs, [-1, 512, 512, 1])

        # [BATCH_SIZE, 512, 512, 1] -> [BATCH_SIZE, 128, 128, 8]
        with tf.variable_scope("conv0"):
            weights = tf.get_variable(
                "weights", [3, 3, 1, 8],
                initializer=tf.random_normal_initializer())
            bias = tf.get_variable("bias", [8],
                                   initializer=tf.random_normal_initializer())
            layer = tf.nn.conv2d(inputs, weights, strides=[1, 1, 1, 1],
                                 padding="SAME")
            layer = tf.nn.bias_add(layer, bias)
            layer = tf.nn.relu(layer)
            layer = tf.nn.max_pool(layer, ksize=[1, 4, 4, 1],
                                   strides=[1, 4, 4, 1], padding="SAME")

        # [BATCH_SIZE, 128, 128, 8] -> [BATCH_SIZE, 32, 32, 8]
        with tf.variable_scope("conv1"):
            weights = tf.get_variable(
                "weights", [3, 3, 8, 8],
                initializer=tf.random_normal_initializer())
            bias = tf.get_variable("bias", [8],
                                   initializer=tf.random_normal_initializer())
            layer = tf.nn.conv2d(layer, weights, strides=[1, 1, 1, 1],
                                 padding="SAME")
            layer = tf.nn.bias_add(layer, bias)
            layer = tf.nn.relu(layer)
            layer = tf.nn.max_pool(layer, ksize=[1, 4, 4, 1],
                                   strides=[1, 4, 4, 1], padding="SAME")

        # [BATCH_SIZE, 32, 32, 8] -> [BATCH_SIZE, 8, 8, 8]
        with tf.variable_scope("conv2"):
            weights = tf.get_variable(
                "weights", [3, 3, 8, 8],
                initializer=tf.random_normal_initializer())
            bias = tf.get_variable("bias", [8],
                                   initializer=tf.random_normal_initializer())
            layer = tf.nn.conv2d(layer, weights, strides=[1, 1, 1, 1],
                                 padding="SAME")
            layer = tf.nn.bias_add(layer, bias)
            layer = tf.nn.relu(layer)
            layer = tf.nn.max_pool(layer, ksize=[1, 4, 4, 1],
                                   strides=[1, 4, 4, 1], padding="SAME")

        # [BATCH_SIZE, 8, 8, 8] -> [BATCH_SIZE, 8 * 8 * 8]
        layer = tf.reshape(layer, [-1, 8 * 8 * 8])

        # [BATCH_SIZE, 8 * 8 * 8] -> [BATCH_SIZE, LABEL_SIZE]
        with tf.variable_scope("output"):
            weights = tf.get_variable(
                "weights", [8 * 8 * 8, LABEL_SIZE],
                initializer=tf.random_normal_initializer())
            bias = tf.get_variable("bias", [LABEL_SIZE],
                                   initializer=tf.random_normal_initializer())
            layer = tf.add(tf.matmul(layer, weights), bias)
        return layer

    def inference(inputs, is_train=True):
        """Dispatch to the network selected by ``FLAGS.model``."""
        if MODEL == "dnn":
            return dnn_inference(inputs, is_train)
        elif MODEL == "lr":
            return lr_inference(inputs, is_train)
        elif MODEL == "wide_and_deep":
            return wide_and_deep_inference(inputs, is_train)
        elif MODEL == "customized":
            return customized_inference(inputs, is_train)
        elif MODEL == "cnn":
            return cnn_inference(inputs, is_train)
        else:
            logging.error("Unknown model, exit now")
            exit(1)

    logging.info("Use the model: {}, model network: {}".format(
        MODEL, FLAGS.model_network))
    logits = inference(batch_features, True)
    batch_labels = tf.to_int64(batch_labels)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=batch_labels)
    loss = tf.reduce_mean(cross_entropy, name="loss")
    global_step = tf.Variable(0, name="global_step", trainable=False)
    if FLAGS.enable_lr_decay:
        logging.info("Enable learning rate decay rate: {}".format(
            FLAGS.lr_decay_rate))
        starter_learning_rate = FLAGS.learning_rate
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   global_step,
                                                   100000,
                                                   FLAGS.lr_decay_rate,
                                                   staircase=True)
    else:
        learning_rate = FLAGS.learning_rate
    optimizer = get_optimizer(FLAGS.optimizer, learning_rate)
    train_op = optimizer.minimize(loss, global_step=global_step)
    # Every inference() call below shares the variables created above.
    tf.get_variable_scope().reuse_variables()

    # Define accuracy op for train data
    train_accuracy_logits = inference(batch_features, False)
    train_softmax = tf.nn.softmax(train_accuracy_logits)
    train_correct_prediction = tf.equal(
        tf.argmax(train_softmax, 1), batch_labels)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_prediction,
                                            tf.float32))

    # Define auc op for train data (labels are expanded to one-hot rows)
    batch_labels = tf.cast(batch_labels, tf.int32)
    sparse_labels = tf.reshape(batch_labels, [-1, 1])
    derived_size = tf.shape(batch_labels)[0]
    indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
    concated = tf.concat(axis=1, values=[indices, sparse_labels])
    outshape = tf.stack([derived_size, LABEL_SIZE])
    new_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
    _, train_auc = tf.contrib.metrics.streaming_auc(train_softmax,
                                                    new_batch_labels)

    # Define accuracy op for validate data
    validate_accuracy_logits = inference(validate_batch_features, False)
    validate_softmax = tf.nn.softmax(validate_accuracy_logits)
    validate_batch_labels = tf.to_int64(validate_batch_labels)
    validate_correct_prediction = tf.equal(
        tf.argmax(validate_softmax, 1), validate_batch_labels)
    validate_accuracy = tf.reduce_mean(tf.cast(validate_correct_prediction,
                                               tf.float32))

    # Define auc op for validate data
    validate_batch_labels = tf.cast(validate_batch_labels, tf.int32)
    sparse_labels = tf.reshape(validate_batch_labels, [-1, 1])
    derived_size = tf.shape(validate_batch_labels)[0]
    indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
    concated = tf.concat(axis=1, values=[indices, sparse_labels])
    outshape = tf.stack([derived_size, LABEL_SIZE])
    new_validate_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0,
                                                   0.0)
    _, validate_auc = tf.contrib.metrics.streaming_auc(
        validate_softmax, new_validate_batch_labels)

    # Define inference op
    inference_features = tf.placeholder("float", [None, FEATURE_SIZE])
    inference_logits = inference(inference_features, False)
    inference_softmax = tf.nn.softmax(inference_logits)
    inference_op = tf.argmax(inference_softmax, 1)
    keys_placeholder = tf.placeholder(tf.int32, shape=[None, 1])
    keys = tf.identity(keys_placeholder)
    model_signature = {
        "inputs": exporter.generic_signature({
            "keys": keys_placeholder,
            "features": inference_features
        }),
        "outputs": exporter.generic_signature({
            "keys": keys,
            "softmax": inference_softmax,
            "prediction": inference_op
        })
    }

    # Initialize saver and summary
    saver = tf.train.Saver()
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("train_accuracy", train_accuracy)
    tf.summary.scalar("train_auc", train_auc)
    tf.summary.scalar("validate_accuracy", validate_accuracy)
    tf.summary.scalar("validate_auc", validate_auc)
    summary_op = tf.summary.merge_all()
    # Local variables back the epoch counters and the streaming AUC metrics.
    init_op = [tf.global_variables_initializer(),
               tf.local_variables_initializer()]

    # Create session to run
    with tf.Session() as sess:
        logging.info("Start to run with mode: {}".format(MODE))
        writer = tf.summary.FileWriter(OUTPUT_PATH, sess.graph)
        sess.run(init_op)

        if MODE == "train":
            # Restore session and start queue runner
            restore_session_from_checkpoint(sess, saver, LATEST_CHECKPOINT)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            start_time = datetime.datetime.now()
            try:
                while not coord.should_stop():
                    _, loss_value, step = sess.run(
                        [train_op, loss, global_step])

                    # Print state while training
                    if step % FLAGS.steps_to_validate == 0:
                        (train_accuracy_value, train_auc_value,
                         validate_accuracy_value, validate_auc_value,
                         summary_value) = sess.run([
                             train_accuracy, train_auc, validate_accuracy,
                             validate_auc, summary_op
                         ])
                        end_time = datetime.datetime.now()
                        logging.info(
                            "[{}] Step: {}, loss: {}, train_acc: {}, train_auc: {}, valid_acc: {}, valid_auc: {}".format(
                                end_time - start_time, step, loss_value,
                                train_accuracy_value, train_auc_value,
                                validate_accuracy_value, validate_auc_value))
                        writer.add_summary(summary_value, step)
                        saver.save(sess, CHECKPOINT_FILE, global_step=step)
                        start_time = end_time
            except tf.errors.OutOfRangeError:
                # The input queues are exhausted: export the model after
                # training.
                export_model(sess, saver, model_signature, FLAGS.model_path,
                             FLAGS.model_version)
            finally:
                # Always stop and join the reader threads, even on errors.
                coord.request_stop()
                coord.join(threads)

        elif MODE == "export":
            if not restore_session_from_checkpoint(sess, saver,
                                                   LATEST_CHECKPOINT):
                logging.error("No checkpoint found, exit now")
                exit(1)

            # Export the model
            export_model(sess, saver, model_signature, FLAGS.model_path,
                         FLAGS.model_version)

        elif MODE == "savedmodel":
            if not restore_session_from_checkpoint(sess, saver,
                                                   LATEST_CHECKPOINT):
                logging.error("No checkpoint found, exit now")
                exit(1)

            logging.info("Export the saved model to {}".format(
                FLAGS.saved_model_path))
            export_path_base = FLAGS.saved_model_path
            export_path = os.path.join(
                compat.as_bytes(export_path_base),
                compat.as_bytes(str(FLAGS.model_version)))

            model_signature = signature_def_utils.build_signature_def(
                inputs={
                    "keys": utils.build_tensor_info(keys_placeholder),
                    "features": utils.build_tensor_info(inference_features)
                },
                outputs={
                    "keys": utils.build_tensor_info(keys),
                    "softmax": utils.build_tensor_info(inference_softmax),
                    "prediction": utils.build_tensor_info(inference_op)
                },
                method_name=signature_constants.PREDICT_METHOD_NAME)

            try:
                builder = saved_model_builder.SavedModelBuilder(export_path)
                builder.add_meta_graph_and_variables(
                    sess, [tag_constants.SERVING],
                    clear_devices=True,
                    signature_def_map={
                        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                        model_signature,
                    },
                    legacy_init_op=tf.group(tf.initialize_all_tables(),
                                            name="legacy_init_op"))
                builder.save()
            except Exception as e:
                # Best effort: a failed export is logged, not re-raised.
                logging.error(
                    "Fail to export saved model, exception: {}".format(e))

        elif MODE == "inference":
            if not restore_session_from_checkpoint(sess, saver,
                                                   LATEST_CHECKPOINT):
                logging.error("No checkpoint found, exit now")
                exit(1)

            # Load inference test data: FEATURE_SIZE feature columns followed
            # by the label column (the original hard-coded 9 features, which
            # only matched the placeholder when FEATURE_SIZE == 9).
            inference_result_file_name = FLAGS.inference_result_file
            inference_test_file_name = FLAGS.inference_test_file
            inference_data = np.genfromtxt(inference_test_file_name,
                                           delimiter=",")
            inference_data_features = inference_data[:, 0:FEATURE_SIZE]
            inference_data_labels = inference_data[:, FEATURE_SIZE]

            # Run inference
            start_time = datetime.datetime.now()
            prediction, prediction_softmax = sess.run(
                [inference_op, inference_softmax],
                feed_dict={inference_features: inference_data_features})
            end_time = datetime.datetime.now()

            # Compute accuracy
            label_number = len(inference_data_labels)
            correct_label_number = int(
                np.sum(inference_data_labels == prediction))
            accuracy = float(correct_label_number) / label_number

            # Compute auc; column 1 of the softmax is used as the score of
            # the positive class.
            y_true = np.array(inference_data_labels)
            y_score = prediction_softmax[:, 1]
            fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score,
                                                     pos_label=1)
            auc = metrics.auc(fpr, tpr)
            logging.info("[{}] Inference accuracy: {}, auc: {}".format(
                end_time - start_time, accuracy, auc))

            # Save result into the file
            np.savetxt(inference_result_file_name, prediction_softmax,
                       delimiter=",")
            logging.info("Save result to file: {}".format(
                inference_result_file_name))
def build_model(self):
    """Build the two-layer LSTM encoder/decoder training graph.

    Creates the input placeholders, runs ``self.lstm1``/``self.lstm2`` over
    the video frames (encoding phase) and then over the caption steps
    (decoding phase), accumulating a masked softmax cross-entropy loss over
    the caption words.

    Returns:
        A tuple ``(loss, video, video_mask, caption, caption_mask, probs,
        predictions, lbls)`` where ``loss`` is the summed per-step loss,
        the next four entries are the placeholders, ``probs`` is the list of
        per-step word logits, ``predictions`` the list of per-step softmax
        outputs and ``lbls`` the list of per-step one-hot labels.
    """
    # video: (batch_size, n_video_lstm_step, dim_image)
    video = tf.placeholder(
        tf.float32,
        [self.batch_size, self.n_video_lstm_step, self.dim_image],
        name="video")
    # 1 - for video input and 0 - for no video input.
    # NOTE(review): this placeholder is returned but not used in the loss.
    video_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_video_lstm_step],
        name="video_mask")
    # caption word ids: (batch_size, n_caption_lstm_step + 1)
    caption = tf.placeholder(
        tf.int32, [self.batch_size, self.n_caption_lstm_step + 1],
        name="caption")
    # caption word present - 1, not present - 0
    caption_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_caption_lstm_step + 1],
        name="caption_mask")

    # Flatten (batch, steps, dim_image) -> (batch * steps, dim_image), apply
    # the image embedding, then restore the (batch, steps, dim_hidden) layout.
    video_flat = tf.reshape(video, [-1, self.dim_image])
    image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W,
                                self.encode_image_b)
    # Fixed: the step count must be n_video_lstm_step (the size of the video
    # placeholder); the original used self.n_lstm_steps here, which only
    # works when both attributes happen to be equal.
    image_emb = tf.reshape(
        image_emb,
        [self.batch_size, self.n_video_lstm_step, self.dim_hidden])

    state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
    state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
    padding = tf.zeros([self.batch_size, self.dim_hidden])

    probs = []
    loss = 0.0
    lbls = []
    predictions = []

    # Encoding phase: lstm1 reads the frame embedding; lstm2 reads zero
    # padding concatenated with lstm1's output.
    for i in range(0, self.n_video_lstm_step):
        if i > 0:
            tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("LSTM1"):
            output1, state1 = self.lstm1(image_emb[:, i, :], state1)

        with tf.variable_scope("LSTM2"):
            output2, state2 = self.lstm2(
                tf.concat([padding, output1], 1), state2)

    # Decoding phase: lstm1 reads zero padding; lstm2 reads the embedding of
    # the current caption word concatenated with lstm1's output.
    print("---- decoding ----")
    for i in range(0, self.n_caption_lstm_step):
        tf.get_variable_scope().reuse_variables()
        # Embedding of the i-th caption word for every batch element.
        current_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, i])

        with tf.variable_scope("LSTM1"):
            output1, state1 = self.lstm1(padding, state1)

        with tf.variable_scope("LSTM2"):
            output2, state2 = self.lstm2(
                tf.concat([current_embed, output1], 1), state2)

        # Target is the next word: (batch_size, 1)
        labels = tf.expand_dims(caption[:, i + 1], 1)
        # Row index 0..batch_size-1: (batch_size, 1)
        indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
        # (row, word id) pairs marking where the one-hot 1s go.
        concated = tf.concat([indices, labels], 1)
        # One-hot targets: (batch_size, vocabulary)
        onehot_labels = tf.sparse_to_dense(
            concated, tf.stack([self.batch_size, self.vocabulary]), 1.0, 0.0)

        # Word logits: (batch_size, vocabulary)
        logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W,
                                      self.embed_word_b)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logit_words, labels=onehot_labels)
        # Only positions whose mask is 1 contribute to the loss.
        cross_entropy = cross_entropy * caption_mask[:, i]

        # NOTE: probs holds raw logits; the softmax is stored in predictions.
        probs.append(logit_words)
        lbls.append(onehot_labels)

        current_loss = tf.reduce_sum(cross_entropy) / self.batch_size
        loss = loss + current_loss
        predictions.append(tf.nn.softmax(logit_words))

    return (loss, video, video_mask, caption, caption_mask, probs,
            predictions, lbls)
def get_datapoint_iter(file_idx=None, batch_size=s_batch):
    """Build a shuffled batch pipeline over the given TFRecord files.

    Args:
        file_idx: iterable of filename suffixes; each one is appended to
            "/home/ubuntu/criteo-tfr-tiny/tfrecords" to form a file path.
            Defaults to no files.
        batch_size: number of examples per batch.

    Returns:
        ``(example_batch, label_batch)`` as produced by
        ``tf.train.shuffle_batch``: the first ``num_features`` entries of the
        dense feature vector and the label cast to float32.
    """
    file_idx = [] if file_idx is None else file_idx
    # A concrete list is required: on Python 3 ``map`` returns a lazy
    # iterator, which string_input_producer cannot convert to a tensor.
    file_names = [
        "/home/ubuntu/criteo-tfr-tiny/tfrecords" + s for s in file_idx
    ]

    # Filename queue over the requested files, cycled indefinitely.
    filename_queue = tf.train.string_input_producer(file_names,
                                                    num_epochs=None)

    # TFRecordReader creates an operator in the graph that reads data from
    # the queue.
    reader = tf.TFRecordReader()

    # The read operator yields a string tensor holding one serialized example.
    _, serialized_example = reader.read(filename_queue)

    # The string tensor is a Protobuf serialized string with the fields
    # label, index and value. ``features`` is a dict of tensors.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([1], dtype=tf.int64),
            'index': tf.VarLenFeature(dtype=tf.int64),
            'value': tf.VarLenFeature(dtype=tf.float32),
        })

    label = features['label']
    index = features['index']
    value = features['value']

    # Show the types of the parsed tensors.
    print(label)
    print(index)
    print(value)

    # VarLenFeatures are returned as SparseTensors; scatter the values into a
    # dense vector of length 33762578 at the positions given by ``index``.
    dense_feature = tf.sparse_to_dense(
        tf.sparse_tensor_to_dense(index),
        [33762578, ],
        tf.sparse_tensor_to_dense(value))

    label_flt = tf.cast(label, tf.float32)

    # min_after_dequeue defines how big a buffer we will randomly sample
    # from -- bigger means better shuffling but slower start up and more
    # memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    # determines the maximum we will prefetch. Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 10
    capacity = min_after_dequeue + 3 * batch_size
    example_batch, label_batch = tf.train.shuffle_batch(
        [dense_feature[0:num_features], label_flt],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)

    return example_batch, label_batch
def build_model(self,
                video,
                video_mask,
                caption,
                caption_mask,
                drop_sent='keep',
                drop_video='keep',
                weight_cap=1.,
                weight_rbm=0.001,
                weight_vid=1.):
    """Build the joint video/sentence training graph and its losses.

    The graph encodes the video (mean of frame embeddings) and the sentence
    (LSTM2), optionally drops either modality, passes both through
    ``self.rbm`` to obtain a semantic vector, and then decodes the sentence
    (LSTM3 with soft attention over frames) and the video (LSTM4).

    Shape legend used in the comments: b = batch_size, nv/n = n_video_steps,
    h = dim_hidden, d_im = dim_image, w = n_words.

    Args:
        video: frame features, b x nv x d_im.
        video_mask: b x nv, non-zero where a frame is present.
        caption: word ids, indexed as ``caption[:, i]`` per caption step.
        caption_mask: non-zero where a caption word is present.
        drop_sent: 'keep', 'random' or 'totally' - how the sentence encoding
            is dropped before the semantic stage.
        drop_video: same choices, for the video encoding.
        weight_cap: weight of the caption loss in the total loss.
        weight_rbm: weight of the loss returned by ``self.rbm``.
        weight_vid: weight of the video reconstruction loss.

    Returns:
        ``(loss, loss_caption, loss_rbm, loss_video, output_semantic)``.
    """
    video_mask = tf.cast(video_mask, tf.float32)
    caption_mask = tf.cast(caption_mask, tf.float32)

    assert drop_sent in ['totally', 'random', 'keep']
    assert drop_video in ['totally', 'random', 'keep']

    video_flat = tf.reshape(video, [-1, self.dim_image])  # (b x nv) x d
    image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W,
                                self.encode_image_b)  # (b x nv) x h
    image_emb = tf.reshape(
        image_emb,
        [self.batch_size, self.n_video_steps, self.dim_hidden])  # b x nv x h
    image_emb = tf.transpose(image_emb, [1, 0, 2])  # n x b x h

    c_init = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    m_init = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    state1 = (c_init, m_init)

    # The original created c2/m2 but never used them; they are all-zero like
    # c_init/m_init, so wiring them in does not change any value.
    c2 = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    m2 = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    state2 = (c2, m2)

    ######## Encoding Stage #########
    # encoding video: mean over the frame axis
    output1 = tf.reduce_mean(image_emb, axis=0)  # b x h

    with tf.variable_scope("model") as scope:
        # encoding sentence
        for i in range(self.n_caption_steps):
            if i > 0:
                scope.reuse_variables()
            with tf.variable_scope("LSTM2"):
                with tf.device("/cpu:0"):
                    current_embed = tf.nn.embedding_lookup(
                        self.Wemb, caption[:, i])
                output2, state2 = self.lstm2_dropout(current_embed,
                                                     state2)  # b x h
    ######## Encoding Stage #########

    ######## Dropout Stage #########
    # Fixed: `tf.constant(0) * output` multiplies an int32 constant with a
    # float tensor, which TensorFlow rejects; zeros_like yields the intended
    # all-zero tensor of the right dtype (and carries no gradient).
    if drop_sent == 'totally':
        output2 = tf.zeros_like(output2)
    elif drop_sent == 'random':
        # coeff is 0 or 1 with equal probability, shared by the whole batch.
        coeff = tf.floor(tf.random_uniform([1], 0, 1) + 0.5)
        output2 = coeff * output2

    if drop_video == 'totally':
        output1 = tf.zeros_like(output1)
    elif drop_video == 'random':
        coeff = tf.floor(tf.random_uniform([1], 0, 1) + 0.5)
        output1 = coeff * output1
    ######## Dropout Stage #########

    ######## Semantic Learning Stage ########
    input_state = tf.concat([output1, output2], 1)  # b x (2 * h)
    loss_rbm, output_semantic = self.rbm(input_state)
    ######## Semantic Learning Stage ########

    ######## Decoding Stage ##########
    state3 = (c_init, m_init)
    state4 = (c_init, m_init)
    sent_prev = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    current_embed = tf.zeros([self.batch_size, self.dim_hidden])  # b x h

    loss_caption = 0.0
    loss_video = 0.0

    # Attention projection of the frames, computed once for all steps.
    image_part = tf.reshape(image_emb, [-1, self.dim_hidden])
    image_part = tf.matmul(image_part,
                           self.embed_att_Ua) + self.embed_att_ba
    image_part = tf.reshape(
        image_part, [self.n_video_steps, self.batch_size, self.dim_hidden])

    ## decoding sentence with attention
    with tf.variable_scope("model") as scope:
        # first write semantic into memory
        with tf.variable_scope("LSTM3"):
            _, state3 = self.lstm3_dropout(
                tf.concat([output_semantic, output_semantic], 1), state3)

        # Fixed: loop over self.n_caption_steps (the original referenced a
        # bare `n_caption_steps`, inconsistent with the encoding loop).
        for i in range(self.n_caption_steps):
            e = tf.tanh(
                tf.matmul(sent_prev, self.embed_att_Wa) +
                image_part)  # n x b x h
            e = tf.reshape(e, [-1, self.dim_hidden])
            e = tf.matmul(e, self.embed_att_w)  # (n x b) x 1
            e = tf.reshape(e, [self.n_video_steps, self.batch_size])  # n x b
            e_hat_exp = tf.multiply(tf.transpose(video_mask),
                                    tf.exp(e))  # n x b
            denomin = tf.reduce_sum(e_hat_exp, 0)  # b
            # Avoid division by zero when every frame is masked out.
            denomin = denomin + tf.to_float(tf.equal(denomin, 0))
            alphas = tf.tile(
                tf.expand_dims(tf.div(e_hat_exp, denomin), 2),
                [1, 1, self.dim_hidden])  # n x b x h, normalized attention
            attention_list = tf.multiply(alphas, image_emb)  # n x b x h
            # soft-attention weighted sum over frames
            atten = tf.reduce_sum(attention_list, 0)  # b x h

            # NOTE(review): LSTM3 was already called once above; whether the
            # i == 0 call needs reuse depends on the TF version - confirm.
            if i > 0:
                scope.reuse_variables()

            with tf.variable_scope("LSTM3"):
                output3, state3 = self.lstm3_dropout(
                    tf.concat([atten, current_embed], 1), state3)  # b x h

            output3_2 = tf.tanh(
                tf.nn.xw_plus_b(
                    tf.concat([output3, atten, current_embed], 1),
                    self.embed_nn_Wp, self.embed_nn_bp))  # b x h
            sent_prev = output3  # b x h

            labels = tf.expand_dims(caption[:, i], 1)  # b x 1
            indices = tf.expand_dims(tf.range(0, self.batch_size, 1),
                                     1)  # b x 1
            concated = tf.concat([indices, labels], 1)  # b x 2
            onehot_labels = tf.sparse_to_dense(
                concated, tf.stack([self.batch_size, self.n_words]), 1.0,
                0.0)  # b x w

            # The embedding of the current word feeds the next step.
            with tf.device("/cpu:0"):
                current_embed = tf.nn.embedding_lookup(self.Wemb,
                                                       caption[:, i])

            logit_words = tf.nn.xw_plus_b(output3_2, self.embed_word_W,
                                          self.embed_word_b)  # b x w
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logit_words, labels=onehot_labels)  # b
            cross_entropy = cross_entropy * caption_mask[:, i]  # b
            loss_caption += tf.reduce_sum(cross_entropy)  # scalar

    ## decoding video with attention
    with tf.variable_scope("model") as scope:
        # first write semantic into memory
        with tf.variable_scope("LSTM4"):
            _, state4 = self.lstm4_dropout(output_semantic, state4)

        ## TODO: add attention for video decoding
        scope.reuse_variables()
        # Fixed: loop over self.n_video_steps (was a bare `n_video_steps`).
        for i in range(self.n_video_steps):
            with tf.variable_scope("LSTM4"):
                output4, state4 = self.lstm4_dropout(image_emb[i, :, :],
                                                     state4)
            video_prev = tf.nn.xw_plus_b(output4, self.decode_image_W,
                                         self.decode_image_b)  # b x d_im
            # Fixed: the original kept the reduced axis (b x 1); multiplying
            # that by the (b,) mask broadcast to b x b and mixed the masks of
            # different batch elements. Reducing to (b,) keeps it per-sample.
            euclid_loss = tf.reduce_sum(
                tf.square(tf.subtract(video_prev, video[:, i, :])),
                axis=1)  # b
            euclid_loss = euclid_loss * video_mask[:, i]  # b
            loss_video += tf.reduce_sum(euclid_loss)  # scalar

    # Normalize each loss by the number of unmasked positions.
    loss_caption = loss_caption / tf.reduce_sum(caption_mask)
    loss_video = loss_video / tf.reduce_sum(video_mask)

    loss = (weight_cap * loss_caption + weight_rbm * loss_rbm +
            weight_vid * loss_video)
    return loss, loss_caption, loss_rbm, loss_video, output_semantic
def run_training():
    """Train the LSTM sequence classifier until the input queue is exhausted.

    Builds the input pipelines for the train/validation/test sets, a
    multi-layer LSTM with a per-timestep softmax output, and an Adam training
    op, then runs the training loop, printing the loss every 10 steps.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default():
        # Extract data from tfrecords, one pipeline per data set.
        data_sets = []
        for set_name in ['train', 'validation', 'test']:
            images, labels, rows = ctt.inputs(data_set_type=set_name,
                                              batch_size=FLAGS.batch_size,
                                              num_epochs=FLAGS.num_epochs)
            # The features arrive as a SparseTensor; densify them.
            images = tf.sparse_to_dense(images.indices, images.shape,
                                        images.values)
            data_sets.append([images, labels, rows])

        # LSTM
        inputs_placeholder = tf.placeholder(
            "float32", [FLAGS.batch_size, None, ctt.FLAGS.feature_col])
        rows_placeholder = tf.placeholder("float32", [FLAGS.batch_size])
        labels_placeholder = tf.placeholder("int32",
                                            [FLAGS.batch_size, None])
        max_time_placeholder = tf.placeholder("float32", [])

        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units,
                                                 forget_bias=1.0,
                                                 state_is_tuple=False)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers,
                                           state_is_tuple=False)
        initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
        outputs, output_states = tf.nn.dynamic_rnn(
            cell=cell,
            inputs=inputs_placeholder,
            sequence_length=rows_placeholder,
            initial_state=initial_state,
            dtype=tf.float32,
            swap_memory=False,
            time_major=False,
            scope=None)

        # Output projection, shape [num_units, num_class], zero-initialized.
        softmax_w = tf.Variable(tf.zeros([num_units, num_class]),
                                name='output_weights')
        softmax_b = tf.Variable(tf.zeros([num_class]), name='output_biases')

        # Loss of an all-zero logit row against label 0; it is subtracted
        # below to discount the padded timesteps.
        # NOTE(review): the row has FLAGS.batch_size entries rather than
        # num_class - confirm this is intended.
        padding_vec = tf.zeros([1, FLAGS.batch_size], dtype=tf.float32)
        padding_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            padding_vec, [0])

        one_example_loss = 0.0
        for batch in range(FLAGS.batch_size):
            # Output layer for one sequence: [time, num_class]
            logits = tf.matmul(outputs[batch, :, :], softmax_w) + softmax_b
            # Per-timestep loss for this sequence.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, labels_placeholder[batch, :], name='xentropy')
            pad_loss_sum = padding_loss * (
                max_time_placeholder - rows_placeholder[batch] + 1)
            one_example_loss += (
                tf.reduce_sum(cross_entropy, name='slot_loss') -
                pad_loss_sum) / rows_placeholder[batch]
        loss = one_example_loss / FLAGS.batch_size

        # Add to the Graph operations that train the model.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        # Create a variable to track the global step.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        # Apply the gradients that minimize the loss (and also increment the
        # global step counter) as a single training step.
        train_op = optimizer.minimize(loss, global_step=global_step)

        # Evaluation ops.
        # NOTE(review): `logits` is the value left by the last iteration of
        # the loop above and `labels` the tensor of the last data set; these
        # ops are not run in this function.
        correct = tf.nn.in_top_k(logits, labels, 1)
        eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        # The op for initializing the variables (the trained variables and
        # the epoch counter).
        init_op = tf.group(tf.initialize_all_variables(),
                           tf.initialize_local_variables())

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Fetch one training batch from the input pipeline.
                feature_data, labels_data, sequence_len = sess.run(
                    data_sets[0])
                max_len = max(sequence_len)

                # Repeat each sequence label along the time axis, giving
                # [batch_size, max_len], and zero the padded positions.
                labels_data = np.array(labels_data).reshape(
                    1, FLAGS.batch_size)
                labels_data = labels_data.repeat(max_len,
                                                 axis=0).transpose()
                for index in range(FLAGS.batch_size):
                    labels_data[index, sequence_len[index]:] = 0

                # Run one training step; the train_op result is discarded.
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={
                        inputs_placeholder: feature_data,
                        rows_placeholder: sequence_len,
                        labels_placeholder: labels_data,
                        max_time_placeholder: max_len
                    })

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                duration = time.time() - start_time

                if step % 10 == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f(%.3f sec)' %
                          (step, loss_value, duration))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' %
                  (FLAGS.num_epochs, step))
        finally:
            # Always stop and join the reader threads and release the
            # session, including when an unexpected error (e.g. the NaN
            # assertion) aborts the loop.
            coord.request_stop()
            try:
                coord.join(threads)
            finally:
                sess.close()
def RPNTarget(all_anchors, num_anchors, gt_boxes, im_shape,
              allowed_border=0, clobber_positives=False,
              positive_overlap=0.7, negative_overlap=0.3,
              foreground_fraction=0.5, minibatch_size=256, seed=None):
    """Get the RPN's classification and regression targets.

    Anchors are compared against the ground-truth (GT) boxes by
    intersection over union (IoU). Every anchor that lies inside the image
    receives a label:

    *  1 (foreground) when its max IoU is >= `positive_overlap`, or when it
       is (one of) the best matching anchor(s) for some GT box;
    *  0 (background) when its max IoU is < `negative_overlap`;
    * -1 (ignore) otherwise.

    The labels are then randomly subsampled so that at most
    `foreground_fraction * minibatch_size` anchors stay foreground and at
    most `minibatch_size - <remaining foreground>` stay background; the
    surplus is set to -1. Anchors outside the image are always -1.

    Regression targets are produced by `encode_tf` against the closest GT
    box and are zeroed for every non-foreground anchor.

    Args:
        all_anchors: Tensor with the anchor boxes; the first four columns
            are read as (x_min, y_min, x_max, y_max).
        num_anchors: Unused; kept so existing callers keep working.
        gt_boxes: Tensor with the ground-truth boxes of the image. Only the
            first four columns (the coordinates) are used.
        im_shape: Shape of the original image, indexed as (height, width).
        allowed_border: How far an anchor may stick out of the image and
            still be used.
        clobber_positives: When False, background labels are assigned
            first so positive labels can overwrite them, which guarantees
            at least one positive anchor per GT box. When True, background
            labels are assigned last and can overwrite positives.
        positive_overlap: IoU at or above which an anchor is foreground.
        negative_overlap: IoU below which an anchor is background.
        foreground_fraction: Fraction of the minibatch that may be
            foreground.
        minibatch_size: Number of anchors kept (label 0 or 1) per image.
        seed: Seed forwarded to `tf.random_shuffle` to make the
            subsampling reproducible.

    Returns:
        Tuple of tensors, each with one row per entry of `all_anchors`:
        labels: float labels in {1, 0, -1}. Shape (total_anchors,).
        bbox_targets: regression targets. Shape (total_anchors, 4).
        max_overlaps: max IoU with any GT box, 0 for anchors outside the
            image. Shape (total_anchors,).
    """

    def _ignore_surplus(current_labels, candidate_inds, num_keep):
        """Set a random surplus of `candidate_inds` to -1, keeping `num_keep`."""
        shuffled = tf.random_shuffle(candidate_inds, seed=seed)
        # Everything beyond `num_keep` candidates has to be ignored.
        num_disable = tf.shape(candidate_inds)[0] - num_keep
        disable_inds = shuffled[:num_disable]
        # Order the indices for sparse_to_dense compatibility (ascending).
        disable_inds, _ = tf.nn.top_k(disable_inds,
                                      k=tf.shape(disable_inds)[-1])
        disable_inds = tf.reverse(disable_inds, [0])
        disable_mask = tf.sparse_to_dense(
            disable_inds, tf.shape(current_labels, out_type=tf.int64),
            True, default_value=False)
        # Put -1 to ignore the anchors in the selected indices.
        return tf.where(
            condition=tf.squeeze(disable_mask),
            x=tf.to_float(tf.fill(tf.shape(current_labels), -1)),
            y=current_labels)

    # Keep only the coordinates of gt_boxes
    gt_boxes = gt_boxes[:, :4]
    all_anchors = all_anchors[:, :4]

    # Only keep anchors inside the image
    (x_min_anchor, y_min_anchor,
     x_max_anchor, y_max_anchor) = tf.unstack(all_anchors, axis=1)
    anchor_filter = tf.logical_and(
        tf.logical_and(
            tf.greater_equal(x_min_anchor, -allowed_border),
            tf.greater_equal(y_min_anchor, -allowed_border)),
        tf.logical_and(
            tf.less(x_max_anchor, im_shape[1] + allowed_border),
            tf.less(y_max_anchor, im_shape[0] + allowed_border)))
    # We (force) reshape the filter so that we can use it as a boolean mask
    anchor_filter = tf.reshape(anchor_filter, [-1])

    # Filter anchors.
    anchors = tf.boolean_mask(all_anchors, anchor_filter,
                              name='filter_anchors')

    # Generate array with the labels for all_anchors (everything ignored).
    labels = tf.fill((tf.gather(tf.shape(all_anchors), [0])), -1)
    labels = tf.boolean_mask(labels, anchor_filter, name='filter_labels')

    # Intersection over union (IoU) overlap between the anchors and the
    # ground truth boxes.
    overlaps = bbox_overlap_tf(tf.to_float(anchors), tf.to_float(gt_boxes))

    # IoU value of the closest GT box for each anchor.
    max_overlaps = tf.reduce_max(overlaps, axis=1)

    if not clobber_positives:
        # Assign bg labels first so that positive labels can clobber them.
        negative_overlap_nonzero = tf.less(max_overlaps, negative_overlap)
        labels = tf.where(condition=negative_overlap_nonzero,
                          x=tf.zeros(tf.shape(labels)),
                          y=tf.to_float(labels))

    # Get the value of the max IoU for the closest anchor for each gt.
    gt_max_overlaps = tf.reduce_max(overlaps, axis=0)

    # Find all the anchors that reach that max (at least one per GT, but
    # could be more). The comparison is kept 2-D on purpose: squeezing it
    # would drop the anchor axis when a single anchor is left, and column 0
    # of tf.where would then hold GT indices instead of anchor indices.
    gt_argmax_overlaps = tf.equal(overlaps, gt_max_overlaps)
    gt_argmax_overlaps = tf.where(gt_argmax_overlaps)[:, 0]
    # Eliminate duplicates indices.
    gt_argmax_overlaps, _ = tf.unique(gt_argmax_overlaps)
    # Order the indices for sparse_to_dense compatibility
    gt_argmax_overlaps, _ = tf.nn.top_k(
        gt_argmax_overlaps, k=tf.shape(gt_argmax_overlaps)[-1])
    gt_argmax_overlaps = tf.reverse(gt_argmax_overlaps, [0])

    # Foreground label: for each ground-truth, anchor with highest overlap.
    # When the argmax is many items we use all of them (for consistency).
    gt_argmax_overlaps_cond = tf.sparse_to_dense(
        gt_argmax_overlaps, tf.shape(labels, out_type=tf.int64),
        True, default_value=False)
    labels = tf.where(condition=gt_argmax_overlaps_cond,
                      x=tf.ones(tf.shape(labels)),
                      y=tf.to_float(labels))

    # Foreground label: IoU greater or equal than positive_overlap.
    positive_overlap_inds = tf.greater_equal(max_overlaps, positive_overlap)
    labels = tf.where(condition=positive_overlap_inds,
                      x=tf.ones(tf.shape(labels)), y=labels)

    if clobber_positives:
        # Assign background labels last so that negative labels can clobber
        # positives.
        negative_overlap_nonzero = tf.less(max_overlaps, negative_overlap)
        labels = tf.where(condition=negative_overlap_nonzero,
                          x=tf.zeros(tf.shape(labels)), y=labels)

    # Subsample positive labels if we have too many.
    num_fg = tf.to_int32(foreground_fraction * minibatch_size)
    fg_inds = tf.squeeze(tf.where(tf.equal(labels, 1)), axis=1)
    fg_inds_size = tf.size(fg_inds)
    subsample_positive_cond = fg_inds_size > num_fg
    labels = tf.cond(
        subsample_positive_cond,
        true_fn=lambda: _ignore_surplus(labels, fg_inds, num_fg),
        false_fn=lambda: labels)

    # Recalculate the foreground indices after (maybe) disabling some of
    # them; the background gets whatever room is left in the minibatch.
    fg_inds = tf.squeeze(tf.where(tf.equal(labels, 1)), axis=1)
    fg_inds_size = tf.size(fg_inds)
    num_bg = tf.to_int32(minibatch_size - fg_inds_size)

    # Subsample negative labels if we have too many.
    bg_inds = tf.squeeze(tf.where(tf.equal(labels, 0)), axis=1)
    bg_inds_size = tf.size(bg_inds)
    subsample_negative_cond = bg_inds_size > num_bg
    labels = tf.cond(
        subsample_negative_cond,
        true_fn=lambda: _ignore_surplus(labels, bg_inds, num_bg),
        false_fn=lambda: labels)

    # Return bbox targets with shape (anchors.shape[0], 4).
    # Find the closest gt box for each anchor.
    argmax_overlaps = tf.argmax(overlaps, axis=1)

    # Positions (in all_anchors) of the anchors that are inside the image.
    anchor_filter_inds = tf.where(anchor_filter)
    gt_boxes = tf.gather(gt_boxes, argmax_overlaps)

    bbox_targets = encode_tf(anchors, gt_boxes)

    # For the anchors that arent foreground, we ignore the bbox_targets.
    anchor_foreground_filter = tf.equal(labels, 1)
    bbox_targets = tf.where(condition=anchor_foreground_filter,
                            x=bbox_targets, y=tf.zeros_like(bbox_targets))

    # We unroll "inside anchors" value for all anchors (for shape
    # compatibility). Missing indices are completed with zeros
    # (because scatter_nd has zeros as default).
    bbox_targets = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds),
                                 updates=bbox_targets,
                                 shape=tf.shape(all_anchors))

    labels_scatter = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds),
                                   updates=labels,
                                   shape=[tf.shape(all_anchors)[0]])
    # We have to put -1 to ignore the indices with 0 generated by
    # scatter_nd, otherwise it will be considered as background.
    labels = tf.where(
        condition=anchor_filter, x=labels_scatter,
        y=tf.to_float(tf.fill(tf.shape(labels_scatter), -1)))

    max_overlaps = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds),
                                 updates=max_overlaps,
                                 shape=[tf.shape(all_anchors)[0]])

    return labels, bbox_targets, max_overlaps
def __init__(self, w_in, w_out, sense_dim, embedding_dim, batch_size,
             context_window, learning_rate, bi_w_in):
    """Build the sense-selection graph and its training op.

    Every row of the graph is one (target word, context) pair; there are
    `context_window * 2 + batch_size` rows. For each row the context is
    embedded, scored against every sense of the target word, and the
    scores of the sampled senses are trained against `reward_prob` with a
    sigmoid cross-entropy loss plus a weighted entropy term.

    Args:
        w_in: Input embedding table handed to `self.dense_lookup`.
        w_out: Output (per-sense) embedding table used with
            `tf.nn.embedding_lookup`.
        sense_dim: Number of senses per word.
        embedding_dim: Unused here; kept for interface compatibility.
        batch_size: Number of target words per batch.
        context_window: Number of context words on each side.
        learning_rate: Learning rate of the gradient-descent optimizer.
        bi_w_in: Bilingual embedding table handed to `self.sparse_lookup`.

    Placeholders and result tensors are exposed as attributes
    (`eval_mode`, `bi_info`, `lengths`, `context_indices`, `major_weight`,
    `reg_weight`, `target_sense_sampled_indices`, `reward_prob`,
    `sense_greedy`, `sense_prob`, `print_cost`, `print_ent`, `update`).
    """
    n_rows = context_window * 2 + batch_size
    max_context_length = 2 * context_window + 1

    eval_mode = tf.placeholder(tf.bool, shape=[])
    self.eval_mode = eval_mode

    self.bi_info = tf.sparse_placeholder(tf.int32)
    bi_info = tf.sparse_to_dense(self.bi_info.indices,
                                 self.bi_info.dense_shape,
                                 self.bi_info.values)
    self.lengths = tf.placeholder(tf.int32, [n_rows])

    # add self here so we can feed it outside this class
    context_indices = tf.placeholder(tf.int32, [n_rows, max_context_length])
    self.context_indices = context_indices

    major_weight = tf.placeholder(tf.float32)
    reg_weight = tf.placeholder(tf.float32)
    self.major_weight = major_weight
    self.reg_weight = reg_weight

    embedded_context = self.dense_lookup(w_in, context_indices)
    bi_embedded_context = self.sparse_lookup(bi_w_in, bi_info, self.lengths)

    # Combine bilingual contextual information (skipped in eval mode).
    embedded_context = tf.cond(
        eval_mode,
        lambda: tf.identity(embedded_context),
        lambda: tf.add(major_weight * embedded_context,
                       (1 - major_weight) * bi_embedded_context))

    # [(context_window*2+batch_size), sense_dim, embedding_dim]
    embedded_word_output = tf.nn.embedding_lookup(
        w_out, context_indices[:, context_window])

    # shape = [(context_window*2+batch_size), sense_dim, 1]
    sense_score = tf.matmul(embedded_word_output, embedded_context)

    # [(context_window*2+batch_size), sense_dim]
    # Only the trailing singleton axis is dropped. An unqualified squeeze
    # would also remove the sense axis when sense_dim == 1 (or the row axis
    # when there is a single row) and break the argmax/softmax below.
    sense_score = tf.squeeze(sense_score, axis=[2])

    # [context_window*2+batch_size]
    sense_greedy = tf.argmax(sense_score, 1)
    self.sense_greedy = sense_greedy

    target_sense_sampled_indices = tf.placeholder(tf.int32, [batch_size])
    self.target_sense_sampled_indices = target_sense_sampled_indices

    # [batch_size]
    reward_prob = tf.placeholder(tf.float32, [batch_size],
                                 name='reward_logit')
    self.reward_prob = reward_prob

    # [(context_window*2+batch_size), sense_dim]
    sense_prob = tf.nn.softmax(sense_score)
    self.sense_prob = sense_prob

    # 1e-8 keeps the log finite when a probability underflows to 0.
    entropy = -tf.multiply(tf.log(sense_prob + 1e-8), sense_prob)
    entropy = tf.reduce_sum(entropy) * reg_weight

    # [(context_window*2+batch_size) * sense_dim]
    sense_score = tf.reshape(sense_score, [n_rows * sense_dim])

    # [batch_size]: flat indices select one (row, sense) score each.
    sense_selected_logit_input = tf.gather(sense_score,
                                           target_sense_sampled_indices)

    cost = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=sense_selected_logit_input, labels=reward_prob))
    cost += entropy

    self.print_cost = cost
    self.print_ent = entropy

    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    self.update = optimizer.minimize(cost)
def train_model(self, sparse_length, k_sparse, debug_level=0):
    """Train a k-sparse autoencoder and return its learned components.

    The encoder is `z = x_2d . W + b`; only the `k_sparse` largest
    activations of each row are kept, and the decoder reconstructs with
    the tied weights `W^T` plus a separate bias.

    Args:
        sparse_length: Size of the hidden (sparse) code.
        k_sparse: Number of hidden activations kept per row.
        debug_level: When > 0, print tensor shapes while building the graph.

    Returns:
        dict with keys 'W', 'b', 'b_dash' (learned parameters), 'tao' and
        'tao_indices' (top-k values/indices on the validation set), 'err'
        (last validation error, or None when no validation step ran),
        'x_recons_train_list' and 'x_recons_test_list' (reconstructions
        split into `self.n_samples_test` chunks).
    """
    print("Running with sparse_length=" + str(sparse_length) +
          " and k_sparse=" + str(k_sparse))
    signal_dim = self.signal_dim
    if debug_level > 0:
        print(signal_dim)
    model_components = {}

    x = tf.placeholder(tf.float32, [None, signal_dim[0], signal_dim[1]])
    # Not consumed by any op below; still created and fed so the graph and
    # feed dicts stay as before.
    batch_size = tf.placeholder(tf.int32)
    if debug_level > 0:
        print(x.shape)

    W = tf.Variable(tf.truncated_normal([signal_dim[1], sparse_length],
                                        stddev=1e-1),
                    name='weights')
    b = tf.Variable(tf.constant(0.0, shape=[sparse_length],
                                dtype=tf.float32),
                    trainable=True, name='biases')
    x_2d = tf.reshape(x, [-1, signal_dim[1]])
    z = tf.matmul(x_2d, W) + b
    if debug_level > 0:
        print(W.shape, b.shape, x_2d.shape, z.shape)

    tao, tao_indices = tf.nn.top_k(z, k=k_sparse, sorted=True)
    # The row count comes from the fed data instead of a hard-coded batch
    # of 50, so train and validation sets of any size work.
    n_rows = tf.shape(z)[0]
    indices_range = tf.expand_dims(tf.range(0, n_rows), 1)
    range_repeated = tf.tile(indices_range, [1, k_sparse])
    if debug_level > 0:
        print(tao, tao_indices, indices_range, range_repeated)
    full_indices = tf.concat([
        tf.expand_dims(range_repeated, 2),
        tf.expand_dims(tao_indices, 2)
    ], axis=2)
    full_indices = tf.reshape(full_indices, [-1, 2])
    mask = tf.ones(tf.shape(full_indices)[0])
    # top_k orders columns by value, not by index, so the (row, col) pairs
    # are not lexicographically sorted: validation must stay disabled.
    tao_mask = tf.sparse_to_dense(full_indices, tf.shape(z), mask,
                                  validate_indices=False)
    tao_mask.set_shape(z.get_shape())

    z_tao = tf.multiply(tao_mask, z)

    b_dash = tf.Variable(tf.constant(0.0, shape=[signal_dim[1]],
                                     dtype=tf.float32),
                         trainable=True, name='biases')
    x_recons = tf.matmul(z_tao, tf.transpose(W)) + b_dash

    error = tf.losses.mean_squared_error(x_2d, x_recons)

    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(error)

    # Stays None if the loop never reaches a validation epoch.
    err = None
    # The context manager releases the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for e in range(1, self.N_epochs):
            sess.run(train_step,
                     feed_dict={
                         x: self.train_set,
                         batch_size: self.train_batch_size
                     })
            if e % self.VALIDATION_AFTER_EPOCHS == 0:
                err = sess.run(error,
                               feed_dict={
                                   x: self.val_set,
                                   batch_size: self.test_batch_size
                               })
                print("Epoch ", e, " has val error : ", err)

        x_recons_data_train = sess.run(x_recons,
                                       feed_dict={x: self.train_set})
        # NOTE(review): the train reconstruction is split by
        # n_samples_test, as in the original; confirm this should not be a
        # train-set count.
        x_recons_train_list = np.split(x_recons_data_train,
                                       self.n_samples_test, axis=0)

        x_recons_data_val = sess.run(x_recons, feed_dict={x: self.val_set})
        x_recons_test_list = np.split(x_recons_data_val,
                                      self.n_samples_test, axis=0)

        model_components['W'] = sess.run(W)
        model_components['b'] = sess.run(b)
        model_components['tao'] = sess.run(tao,
                                           feed_dict={x: self.val_set})
        model_components['tao_indices'] = sess.run(
            tao_indices, feed_dict={x: self.val_set})
        model_components['b_dash'] = sess.run(b_dash)

    model_components['err'] = err
    model_components['x_recons_train_list'] = x_recons_train_list
    model_components['x_recons_test_list'] = x_recons_test_list
    return model_components
def true_segments_1d(segments,
                     mode=SegmentsMode.CENTERS,
                     max_gap=0,
                     min_length=0,
                     name=None):
    """Locates the contiguous runs of True values in a 1D boolean tensor.

    Runs of True separated by at most `max_gap` False values are merged
    into one run, and merged runs shorter than `min_length` are dropped.

    Args:
      segments: 1D boolean tensor.
      mode: The SegmentsMode. CENTERS returns the rounded center of each
        run; any other value returns the start of each run.
      max_gap: Fill gaps of length at most `max_gap` between true segments.
        int.
      min_length: Minimum length of a returned segment. int.
      name: Optional name for the op.

    Returns:
      A tuple `(positions, lengths)`:
      positions: Depending on `mode`, either the start of each True run, or
        the (rounded) center of each True run.
      lengths: The length of each True run.
    """
    with tf.name_scope(name, "true_segments", [segments]):
        segments = tf.convert_to_tensor(segments, tf.bool)
        run_starts, run_lengths = _segments_1d(
            segments, mode=SegmentsMode.STARTS)

        # Runs alternate between True and False. The True runs start at run
        # 0 when the first value is True; run 0 is also chosen when nothing
        # is True at all (this covers empty input), otherwise run 1.
        true_runs_start_at_zero = tf.logical_or(
            tf.reduce_any(segments[0:1]), ~tf.reduce_any(segments))
        first_run = tf.cond(true_runs_start_at_zero,
                            lambda: tf.constant(0),
                            lambda: tf.constant(1))

        num_runs = tf.shape(run_starts)[0]
        run_nums = tf.range(num_runs)
        is_true_run = tf.equal(run_nums % 2, first_run % 2)

        # A gap is a False run strictly between two True runs: it has the
        # other parity, comes after the first True run and is not the last
        # run.
        is_interior = tf.logical_and(
            tf.greater(run_nums, first_run),
            tf.less(run_nums, num_runs - 1))
        is_gap = tf.logical_and(
            tf.not_equal(run_nums % 2, first_run % 2), is_interior)
        is_short_gap = tf.logical_and(
            is_gap, tf.less_equal(run_lengths, max_gap))

        # Group consecutive runs that are either True or a bridgeable gap;
        # each group ("run of runs") becomes one merged run.
        mergeable = tf.logical_or(is_true_run, is_short_gap)
        group_starts, _ = _segments_1d(mergeable, mode=SegmentsMode.STARTS)

        # A merged run starts where the first original run of its group
        # starts.
        merged_run_starts = tf.gather(run_starts, group_starts)

        # Map every original run to its group id: put a 1 at each group
        # start except the first, then take the running sum.
        group_boundaries = tf.sparse_to_dense(
            sparse_indices=tf.cast(group_starts[1:, None], tf.int64),
            output_shape=tf.cast(num_runs[None], tf.int64),
            sparse_values=tf.ones_like(group_starts[1:]))
        group_ids = tf.cumsum(group_boundaries)

        # The length of a merged run is the sum of the lengths it absorbed.
        merged_run_lengths = tf.segment_sum(run_lengths, group_ids)

        if mode is not SegmentsMode.CENTERS:
            positions = merged_run_starts
        else:
            half_lengths = tf.floordiv(merged_run_lengths - 1, 2)
            positions = merged_run_starts + half_lengths

        # With no True value at all there is a single (False) run; moving
        # the offset to 1 skips it.
        has_no_true = tf.logical_not(tf.reduce_any(segments))
        first_run = first_run + tf.to_int32(has_no_true)

        positions = positions[first_run::2]
        merged_run_lengths = merged_run_lengths[first_run::2]

        # Only take segments at least min_length long.
        keep = tf.greater_equal(merged_run_lengths, min_length)
        keep.set_shape([None])
        positions = tf.boolean_mask(positions, keep)
        merged_run_lengths = tf.boolean_mask(merged_run_lengths, keep)
        return positions, merged_run_lengths
def build_inference(self, x, flag="train"):
    """Build the forward graph: embeddings, DIN attention, deep and FC layers.

    The last three columns of `x` are read as '#'-separated browse
    histories (skus, cates, brands); every other column is a sparse
    feature that is either parsed as an integer index or hashed into a
    bucket.

    Args:
        x: 2-D string tensor; indexed as `x[:, :-3]` (features) and
            `x[:, -3:]` (browse histories).
        flag: "train" enables the regularizers from `self.param_dict` and
            the mini-batch aware regularization; any other value disables
            them.

    Returns:
        Output tensor of the last layer built (the sigmoid of the final
        fully connected layer when `fc_layers` is non-empty).
    """
    # Set up the regularizers, one for each of the four parts of the
    # network. They are only active while training.
    regularizer1 = self.param_dict[
        "regulerizer1"] if flag == "train" else None
    regularizer2 = self.param_dict[
        "regulerizer2"] if flag == "train" else None
    regularizer3 = self.param_dict[
        "regulerizer3"] if flag == "train" else None
    regularizer4 = self.param_dict[
        "regulerizer4"] if flag == "train" else None
    is_train = True if flag == "train" else False
    # Fetch the required parameters first.
    hash_size = self.param_dict['hash_size']
    no_hash = self.param_dict["no_hash"]
    embed_size = self.param_dict["embed_size"]
    # Get the activation function according to the configuration.
    act_fn = self.get_activation_func(is_train)
    # Whether mini-batch aware (MBA) regularization is enabled.
    is_mba_reg = self.param_dict["is_mba_reg"]
    lambda_reg_mba = self.param_dict["lambda_reg_mba"]
    is_action_mba_reg = self.param_dict["is_action_mba_reg"]
    # Split the input into plain features and the three browse histories.
    x_feature = x[:, :-3]
    x_action_lists = x[:, -3:]
    # First convert the sparse features to indices.
    x_sparse = []
    for i in range(len(hash_size)):
        if i in no_hash:
            # These features can be used directly as indices; no hashing
            # is needed.
            x_i = tf.string_to_number(x_feature[:, i], tf.int32)
            x_sparse.append(x_i)
        else:
            # These features are converted to indices with a hash function.
            x_i = tf.string_to_hash_bucket_strong(
                input=x_feature[:, i],
                num_buckets=hash_size[i],
                key=[679362, 964545],
                name="sparse_feature_{}".format(i))
            x_sparse.append(x_i)
    # Convert the sparse indices into embedding vectors.
    x_embed = []
    w_action_embed = []
    x_action = []
    indice_sku_cate_brand = []
    sku_cate_brand_index = self.param_dict["sku_cate_brand_index"]
    for i in range(len(embed_size)):
        if embed_size[i] != -1:
            with tf.variable_scope("embedding_{}".format(i)):
                # Larger tables are split into more partitions
                # (none, 5, 10, 15, 20 or 30 depending on hash_size[i]).
                if hash_size[i] <= 500000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]))
                elif hash_size[i] > 500000 and hash_size[i] <= 5000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]],
                        regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(5, 0))
                elif hash_size[i] > 5000000 and hash_size[i] <= 10000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]],
                        regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(10, 0))
                elif hash_size[i] > 10000000 and hash_size[i] <= 15000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]],
                        regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(15, 0))
                elif hash_size[i] > 15000000 and hash_size[i] <= 20000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]],
                        regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(20, 0))
                else:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]],
                        regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(30, 0))
                x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
                if i in sku_cate_brand_index:
                    # Embeddings for skuid, cateid, brandid: remember the
                    # table, the looked-up vector and the indices so the
                    # browse histories can share them below.
                    w_action_embed.append(weights)
                    x_action.append(x_i)
                    indice_sku_cate_brand.append(x_sparse[i])
                    if is_train and is_mba_reg and not is_action_mba_reg:
                        # Compute the MBA regularization here only when it
                        # is not computed together with the browse
                        # histories later on.
                        self.calculate_mini_batch_aware_reg(
                            weights, x_sparse[i], lambda_reg_mba)
                else:
                    if is_train and is_mba_reg:
                        # Compute the MBA regularization.
                        self.calculate_mini_batch_aware_reg(
                            weights, x_sparse[i], lambda_reg_mba)
        else:
            # embed_size of -1 means "no embedding": use a one-hot vector.
            x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])
        x_embed.append(x_i)
    # (An older, commented-out version of the loop above, which built the
    # sku/cate/brand tables in a separate branch, was removed here.)
    x_embed = tf.concat(x_embed, 1)
    # Model the browsing behaviour: build the DIN part.
    with tf.name_scope("user_behaviours"):
        x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [
            -1,
        ])
        x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [
            -1,
        ])
        x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [
            -1,
        ])
        browse_lists = [
            x_browse_skus_list, x_browse_cates_list, x_browse_brand_list
        ]
        browse_names = ['skus', 'cates', 'brands']
        browse_nums = self.param_dict["browse_nums"]
        x_action_list_embeds = []
        sum_poolings = []
        x_action_list_masks = []
        for i in range(len(browse_names)):
            with tf.name_scope("user_browse_{}_embedding".format(
                    browse_names[i])):
                # Reuse the table of the matching sku/cate/brand feature.
                browse_w_embed = w_action_embed[i]
                x_browse_action = browse_lists[
                    i]  # shape of x_browse_action is [?,]
                x_browse_action_list = tf.string_split(
                    x_browse_action, "#")
                # Dense [rows, browse_nums[i]] matrix of hashed ids, with
                # -1 where a row has fewer entries.
                # NOTE(review): a row with more than browse_nums[i]
                # entries would yield indices outside this shape -
                # confirm inputs are truncated upstream.
                x_browse_action_list_indices = tf.sparse_to_dense(
                    x_browse_action_list.indices,
                    [x_browse_action_list.dense_shape[0], browse_nums[i]],
                    tf.string_to_hash_bucket_strong(
                        x_browse_action_list.values,
                        num_buckets=browse_w_embed.get_shape()[0].value,
                        key=[679362, 964545],
                        name="sparse_user_browse_{}".format(
                            browse_names[i])), -1)
                # True where a real browse entry exists.
                indice_mask = tf.reshape(
                    tf.not_equal(x_browse_action_list_indices, -1),
                    [-1, browse_nums[i]])
                x_action_list_masks.append(indice_mask)
                # NOTE(review): the padded positions are looked up with
                # index -1; verify the lookup tolerates that on the
                # target device.
                x_action_list_embed = tf.reshape(
                    tf.nn.embedding_lookup(browse_w_embed,
                                           x_browse_action_list_indices), [
                                               -1, browse_nums[i],
                                               browse_w_embed.get_shape()[1].value
                                           ])
                if is_train and is_action_mba_reg:
                    # Compute the MBA regularization over the browse ids
                    # together with the ids of the feature itself.
                    indice_action = tf.concat([
                        tf.string_to_hash_bucket_strong(
                            x_browse_action_list.values,
                            num_buckets=browse_w_embed.get_shape()
                            [0].value,
                            key=[679362, 964545]), indice_sku_cate_brand[i]
                    ], 0)
                    self.calculate_mini_batch_aware_reg(
                        browse_w_embed, indice_action, lambda_reg_mba)
                x_action_list_embeds.append(x_action_list_embed)
        with tf.name_scope("activation_unit"):
            act_unit_hidden_layers = self.param_dict[
                "act_unit_hidden_layers"]
            action_indexs = self.param_dict["action_indexs"]
            for i in action_indexs:
                x_action_list_embed = x_action_list_embeds[i]
                x_ad_embedded = x_action[i]
                indice_mask = x_action_list_masks[i]
                # Approximate outer product: for every browse position,
                # concatenate the behaviour embedding, its difference and
                # its element-wise product with the candidate embedding,
                # and the candidate embedding itself. (An exact, flattened
                # outer product was tried before and is no longer used.)
                x_action_list_embed_new = tf.transpose(
                    x_action_list_embed, [1, 0, 2])
                concat_list = [
                    tf.concat([
                        x_action_list_embed_new[ii],
                        x_action_list_embed_new[ii] - x_ad_embedded,
                        x_action_list_embed_new[ii] * x_ad_embedded,
                        x_ad_embedded
                    ], 1)
                    for ii in range(x_action_list_embed_new.shape[0].value)
                ]
                act_unit_in = concat_list[0].shape[1].value
                act_in = concat_list
                with tf.variable_scope("activation_unit_{}_list".format(
                        browse_names[i])):
                    # Hidden layers of the activation unit; the same
                    # weights are applied to every browse position.
                    for ii in range(len(act_unit_hidden_layers)):
                        weights_act_unit = self.get_weight_variable(
                            [act_unit_in, act_unit_hidden_layers[ii]],
                            regularizer3,
                            self.param_dict["initializer_act_unit_w"](
                                [act_unit_in, act_unit_hidden_layers[ii]]),
                            name='_act_unit_w_{}'.format(ii))
                        biases_act_unit = tf.get_variable(
                            "biases_{}_act_unit".format(ii),
                            [act_unit_hidden_layers[ii]],
                            initializer=tf.constant_initializer(0.0),
                            dtype=tf.float32)
                        # act_in_i is a (tensor, position) pair; the
                        # position only goes into the op name.
                        act_out = list(
                            map(
                                lambda act_in_i: act_fn(
                                    tf.matmul(act_in_i[0], weights_act_unit
                                              ) + biases_act_unit,
                                    name="act_func_{}_{}".format(
                                        ii, act_in_i[1])),
                                zip(act_in, range(len(act_in)))))
                        act_in = act_out
                        act_unit_in = act_in[0].shape[1].value
                    # Output layer: one scalar attention score per position.
                    act_output_in = act_in
                    act_output_unit = act_unit_in
                    weights_act_unit_output = self.get_weight_variable(
                        [act_output_unit, 1],
                        regularizer3,
                        self.param_dict["initializer_act_unit_w"](
                            [act_output_unit, 1]),
                        name='_act_unit_output_w')
                    biases_act_unit_output = tf.get_variable(
                        "biases_act_unit_output", [1],
                        initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)
                    act_output_out = tf.concat(
                        list(
                            map(
                                lambda act_output_i: tf.expand_dims(
                                    tf.matmul(act_output_i,
                                              weights_act_unit_output) +
                                    biases_act_unit_output, 0),
                                act_output_in)), 0)
                    # Back to batch-major order.
                    active_weight_score = tf.transpose(act_output_out,
                                                       [1, 0, 2])
                    # Set the weights of missing (padded) behaviours to 0.0.
                    padding = tf.zeros_like(active_weight_score)
                    active_weight_score_t = tf.where(
                        tf.expand_dims(indice_mask, 2),
                        active_weight_score, padding)
                    with tf.name_scope("weight_sum_pooling"):
                        # Attention-weighted sum over the browse positions.
                        sum_pooling = tf.reduce_sum(
                            x_action_list_embed * active_weight_score_t, 1)
                    sum_poolings.append(sum_pooling)
    x_deep_in = tf.concat([x_embed, tf.concat(sum_poolings, 1)], 1)
    # Build the deep module.
    with tf.name_scope("deep_network"):
        deep_layers = self.param_dict["deep_layers"]
        for i in range(len(deep_layers)):
            with tf.variable_scope("dnn_layer_{}".format(i)):
                weights = self.get_weight_variable(
                    [x_deep_in.shape[1].value, deep_layers[i]],
                    regularizer2,
                    self.param_dict["initializer_dnn_w"](
                        [x_deep_in.shape[1].value, deep_layers[i]]))
                biases = tf.get_variable(
                    "biases", [deep_layers[i]],
                    initializer=tf.constant_initializer(0.0),
                    dtype=tf.float32)
                layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases,
                                 name="deep_mlp_{}".format(i))
                x_deep_in = layer_i
    # Build the output module (fully connected layers with sigmoid).
    x_fc_in = x_deep_in
    with tf.name_scope("fc_layers"):
        fc_layers = self.param_dict['fc_layers']
        for i in range(len(fc_layers)):
            with tf.variable_scope("fc_layers_{}".format(i)):
                weights = self.get_weight_variable(
                    [x_fc_in.shape[1].value, fc_layers[i]],
                    regularizer4,
                    self.param_dict["initializer_fc_w"](
                        [x_fc_in.shape[1].value, fc_layers[i]]))
                biases = tf.get_variable(
                    "biases", [fc_layers[i]],
                    initializer=tf.constant_initializer(0.0),
                    dtype=tf.float32)
                layer_i = tf.nn.sigmoid(
                    tf.matmul(x_fc_in, weights) + biases)
                x_fc_in = layer_i
    # NOTE(review): despite the name, this is the sigmoid output of the
    # last FC layer, not a pre-activation logit.
    logit = x_fc_in
    return logit
trainable=False) embed = tf.nn.embedding_lookup(embeddings, questions) # Define weights weights = { # Hidden layer weights => 2*n_hidden because of foward + backward cells 'out': tf.Variable(tf.random_normal([2 * FLAGS.n_hidden, FLAGS.n_classes])) } biases = {'out': tf.Variable(tf.random_normal([FLAGS.n_classes]))} pred = model(FLAGS.n_hidden, embed, weights, biases, FLAGS.attention_size) indices = tf.expand_dims(tf.range(0, FLAGS.batch_size, 1), 1) concated = tf.concat([indices, labels], 1) labels = tf.sparse_to_dense(concated, tf.stack([FLAGS.batch_size, FLAGS.n_classes]), 1.0, 0.0) # Define loss and optimizer cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=labels)) global_step = tf.identity(tf.Variable(0, trainable=False)) # optimizer = tf.train.AdamOptimizer( # learning_rate=cyclic_learning_rate(global_step, learning_rate=FLAGS.learning_rate)).minimize(cost) optimizer = tf.train.AdamOptimizer(learning_rate=tf.train.exponential_decay( FLAGS.learning_rate, global_step, 100000, 0.96)).minimize(cost) # Evaluate model tags = tf.argmax(labels, 1) y_pred_cls = tf.argmax(tf.nn.softmax(pred), 1) correct_pred = tf.equal(tf.argmax(pred, 1), tags)
def _create_network(self):
    """Build the step-unrolled LSTM graph and store its loss on ``self.loss``.

    Reads ``self.network_architecture``, ``self.batch_size``, ``self.mask``,
    ``self.timestep``, ``self.caption_placeholder``, ``self.lstm`` and
    ``self.n_words``; writes ``self.network_weights``, ``self.debug`` and
    ``self.loss``.

    The names ``same_embedding``, ``form2``, ``form3``, ``transfertype2``,
    ``use_ctc``, ``binary_dim`` and ``ctc_loss`` are not defined inside this
    method -- presumably module-level settings/imports; TODO confirm.
    """
    # Initialize autoencode network weights and biases
    network_weights = self._initialize_weights(**self.network_architecture)
    # NOTE(review): start_token_tensor is not referenced again in this method.
    start_token_tensor = tf.constant(
        (np.zeros([self.batch_size, binary_dim])).astype(np.float32),
        dtype=tf.float32)
    self.network_weights = network_weights
    # Sum of the mask along its last axis, as int32 (only used by the CTC
    # branch at the end of the method).
    seqlen = tf.cast(tf.reduce_sum(self.mask, reduction_indices=-1), tf.int32)
    # Weight applied to every KLD term: tanh(timestep / 1600).
    KLD_penalty = tf.tanh(tf.cast(self.timestep, tf.float32) / 1600.0)
    # Use recognition network to determine mean and
    # (log) variance of Gaussian distribution in latent
    # space
    if not same_embedding:
        input_embedding, input_embedding_KLD_loss = self._get_input_embedding(
            [
                network_weights['variational_encoding'],
                network_weights['biases_variational_encoding']
            ], network_weights['input_meaning'])
    else:
        input_embedding, input_embedding_KLD_loss = self._get_input_embedding(
            [
                network_weights['variational_encoding'],
                network_weights['biases_variational_encoding']
            ], network_weights['LSTM'])
    state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)
    loss = 0
    self.debug = 0
    # Per-step sigmoid outputs; only filled when use_ctc is set.
    probs = []
    with tf.variable_scope("RNN"):
        for i in range(self.network_architecture['maxlen']):
            if i > 0:
                # Steps after the first are fed the embedding of the previous
                # caption entry; step 0 is fed the input embedding instead.
                # current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
                if form3:
                    current_embedding, KLD_loss = self._get_word_embedding(
                        [network_weights['LSTM']],
                        network_weights['input_meaning'],
                        self.caption_placeholder[:, i - 1, :],
                        logit=True)
                elif form2:
                    current_embedding, KLD_loss = self._get_word_embedding(
                        [
                            network_weights['variational_encoding'],
                            network_weights['biases_variational_encoding']
                        ],
                        network_weights['LSTM'],
                        self.caption_placeholder[:, i - 1, :],
                        logit=True)
                else:
                    current_embedding, KLD_loss = self._get_word_embedding(
                        [
                            network_weights['variational_encoding'],
                            network_weights['biases_variational_encoding']
                        ], network_weights['LSTM'],
                        self.caption_placeholder[:, i - 1])
                if transfertype2:
                    # Block gradients from flowing into the word embedding.
                    current_embedding = tf.stop_gradient(current_embedding)
                # Masked, penalty-weighted KLD term of this step's embedding.
                loss += tf.reduce_sum(
                    KLD_loss * self.mask[:, i]) * KLD_penalty
            else:
                current_embedding = input_embedding
            if i > 0:
                # Re-use the LSTM variables created on the first step.
                tf.get_variable_scope().reuse_variables()
            out, state = self.lstm(current_embedding, state)
            if i > 0:
                if not form2:
                    # Build a dense [batch_size, n_words] target with 1.0 at
                    # (row, caption_placeholder[row, i]) and 0.0 elsewhere.
                    labels = tf.expand_dims(self.caption_placeholder[:, i], 1)
                    ix_range = tf.range(0, self.batch_size, 1)
                    ixs = tf.expand_dims(ix_range, 1)
                    concat = tf.concat([ixs, labels], 1)
                    onehot = tf.sparse_to_dense(
                        concat, tf.stack([self.batch_size, self.n_words]),
                        1.0, 0.0)
                else:
                    # With form2 the placeholder slice is used directly as the
                    # target.
                    onehot = self.caption_placeholder[:, i, :]
                logit = tf.matmul(
                    out, network_weights['LSTM']['encoding_weight']
                ) + network_weights['LSTM']['encoding_bias']
                if not use_ctc:
                    if form2:
                        # best_word=tf.nn.softmax(logit)
                        # best_word=tf.round(best_word)
                        # all_the_f_one_h.append(best_word)
                        xentropy = tf.nn.sigmoid_cross_entropy_with_logits(
                            logits=logit, labels=onehot)
                        xentropy = tf.reduce_sum(xentropy,
                                                 reduction_indices=-1)
                    else:
                        xentropy = tf.nn.softmax_cross_entropy_with_logits(
                            logits=logit, labels=onehot)
                    # Mask the per-row loss for this step, then sum it.
                    xentropy = xentropy * self.mask[:, i]
                    xentropy = tf.reduce_sum(xentropy)
                    self.debug += xentropy
                    loss += xentropy
                else:
                    # CTC path: collect the sigmoid outputs for each step.
                    probs.append(tf.expand_dims(tf.nn.sigmoid(logit), 1))
    if not use_ctc:
        loss_ctc = 0
        # self.debug becomes the summed cross-entropy divided by the mask sum
        # over columns 1 and later.
        self.debug = self.debug / tf.reduce_sum(self.mask[:, 1:])
    else:
        probs = tf.concat(probs, axis=1)
        self.debug = probs[0, 2]
        probs = ctc_loss.get_output_probabilities(
            probs, self.caption_placeholder[:, 1:, :])
        loss_ctc = ctc_loss.loss(
            probs, self.caption_placeholder[:, 1:, :],
            self.network_architecture['maxlen'] - 2, self.batch_size,
            seqlen - 1)
    # self.debug=tf.reduce_sum(input_embedding_KLD_loss)/self.batch_size*KLD_penalty+loss_ctc
    # Final loss: accumulated step loss divided by the mask sum, plus the
    # penalty-weighted input-embedding KLD per batch element, plus the CTC
    # term (0 when use_ctc is off).
    loss = (loss / tf.reduce_sum(self.mask[:, 1:])) + tf.reduce_sum(
        input_embedding_KLD_loss
    ) / self.batch_size * KLD_penalty + loss_ctc
    self.loss = loss
def build_model(self, video, video_mask, caption, caption_1, caption_mask):
    """Build the joint video/sentence graph and return its losses.

    The graph mean-pools the embedded frames, encodes ``caption_1`` with
    LSTM2, feeds both encodings to ``self.vae`` and then decodes the
    resulting semantic vector twice: into words (LSTM3) and into frame
    features (LSTM4).

    Args:
        video: frame features; reshaped to ``[-1, self.dim_image]`` and
            indexed as ``video[:, i, :]``.
        video_mask: per-frame mask, cast to float32 and read as
            ``video_mask[:, i]``.
        caption: word ids used as decoder inputs and targets
            (``caption[:, i]``).
        caption_1: word ids fed to the sentence encoder.
        caption_mask: per-word mask, cast to float32 and read as
            ``caption_mask[:, i]``.

    Returns:
        Tuple ``(loss, loss_caption, loss_tied, loss_latent, loss_video,
        output_semantic, output1, output2)``.

    ``cpu_device``, ``n_caption_steps`` and the ``*_weight`` factors are not
    defined in this method -- presumably module-level settings; TODO confirm.
    """
    caption_mask = tf.cast(caption_mask, tf.float32)
    video_mask = tf.cast(video_mask, tf.float32)

    # Frame embeddings (also used as decoder inputs below).
    video_flat = tf.reshape(video, [-1, self.dim_image])  # (b x nv) x d
    image_emb = tf.nn.xw_plus_b(
        video_flat, self.encode_image_W, self.encode_image_b)  # (b x nv) x h
    image_emb = tf.reshape(
        image_emb,
        [self.batch_size, self.n_video_steps, self.dim_hidden])  # b x nv x h

    c_init = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    m_init = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    state2 = (c_init, m_init)  # 2 x b x h

    # ---------------- Encoding stage ----------------
    # Video: mean pooling over frames, squashed into (-1, 1).
    output1 = tf.nn.tanh(tf.reduce_mean(image_emb, axis=1))  # b x h

    # Sentence: run LSTM2 over caption_1; output2 is the last step's output.
    with tf.variable_scope("model") as scope:
        for i in xrange(self.n_caption_steps):
            if i > 0:
                scope.reuse_variables()
            with tf.variable_scope("LSTM2"):
                with tf.device(cpu_device):
                    current_embed = tf.nn.embedding_lookup(
                        self.Wemb, caption_1[:, i])  # b x h
                output2, state2 = self.lstm2_dropout(
                    current_embed, state2)  # b x h

    # ---------------- Semantic learning stage ----------------
    input_state = tf.concat([output1, output2], 1)  # b x (2 * h)
    loss_latent, output_semantic = self.vae(input_state)

    # ---------------- Tied loss ----------------
    # Predict each modality's encoding from the other one; both directions
    # use the same weight matrix self.sv_W with separate biases.
    sh_pred = tf.tanh(tf.nn.xw_plus_b(output1, self.sv_W, self.s_b))  # b x h
    loss_tied_1 = tf.reduce_sum(tf.square(tf.subtract(output2, sh_pred)))
    vh_pred = tf.tanh(tf.nn.xw_plus_b(output2, self.sv_W, self.v_b))  # b x h
    loss_tied_2 = tf.reduce_sum(tf.square(tf.subtract(output1, vh_pred)))
    loss_tied = loss_tied_1 + loss_tied_2

    tf.summary.scalar('loss_tied_1', loss_tied_1)
    tf.summary.scalar('loss_tied_2', loss_tied_2)
    tf.summary.histogram('vh_pred', vh_pred)
    tf.summary.histogram('sh_pred', sh_pred)

    # ---------------- Decoding stage ----------------
    state3 = (c_init, m_init)  # 2 x b x h
    state4 = (c_init, m_init)  # 2 x b x h
    current_embed = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
    video_prev = tf.zeros([self.batch_size, self.dim_hidden])
    loss_caption = 0.0
    loss_video = 0.0

    # Decode the sentence (no attention).
    with tf.variable_scope("model") as scope:
        # Prime LSTM3 with the semantic vector before emitting words.
        with tf.variable_scope("LSTM3"):
            _, state3 = self.lstm3_dropout(output_semantic, state3)  # b x h
        # NOTE(review): the bare name n_caption_steps (not
        # self.n_caption_steps) is kept from the original -- confirm the two
        # agree.
        for i in xrange(n_caption_steps):
            scope.reuse_variables()
            with tf.variable_scope("LSTM3"):
                output3, state3 = self.lstm3_dropout(
                    current_embed, state3)  # b x h

            # One-hot targets: 1.0 at (row, caption[row, i]), 0.0 elsewhere.
            labels = tf.expand_dims(caption[:, i], 1)  # b x 1
            indices = tf.expand_dims(
                tf.range(0, self.batch_size, 1), 1)  # b x 1
            concated = tf.concat([indices, labels], 1)  # b x 2
            onehot_labels = tf.sparse_to_dense(
                concated, tf.stack([self.batch_size, self.n_words]),
                1.0, 0.0)  # b x w

            # Next step is fed the ground-truth word (teacher forcing).
            with tf.device(cpu_device):
                current_embed = tf.nn.embedding_lookup(
                    self.Wemb, caption[:, i])

            logit_words = tf.nn.xw_plus_b(
                output3, self.embed_word_W, self.embed_word_b)  # b x w
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logit_words, labels=onehot_labels)  # b
            cross_entropy = cross_entropy * caption_mask[:, i]  # b
            loss_caption += tf.reduce_sum(cross_entropy)  # scalar

    # Decode the video (no attention).
    with tf.variable_scope("model") as scope:
        ## TODO: add attention for video decoding
        # Prime LSTM4 with the semantic vector first.
        with tf.variable_scope("LSTM4"):
            _, state4 = self.lstm4_dropout(output_semantic, state4)
        for i in xrange(self.n_video_steps):
            scope.reuse_variables()
            with tf.variable_scope("LSTM4"):
                output4, state4 = self.lstm4_dropout(video_prev, state4)
            decode_image = tf.nn.xw_plus_b(
                output4, self.decode_image_W, self.decode_image_b)  # b x d_im
            video_prev = image_emb[:, i, :]  # b x h

            # Per-sample squared error, reduced to shape [b].  The original
            # kept the reduced axis ([b, 1]); multiplied by a [b] mask slice
            # that broadcasts to [b, b], weighting every sample's error by
            # every sample's mask.  Flattening the mask slice keeps the
            # product element-wise whether the slice is [b] or [b, 1].
            euclid_loss = tf.reduce_sum(
                tf.square(tf.subtract(decode_image, video[:, i, :])),
                axis=1)  # b
            frame_mask = tf.reshape(video_mask[:, i], [-1])  # b
            euclid_loss = euclid_loss * frame_mask  # b
            loss_video += tf.reduce_sum(euclid_loss)  # scalar

    # Normalise each sequence loss by the sum of its mask.
    loss_caption = loss_caption / tf.reduce_sum(caption_mask)
    loss_video = loss_video / tf.reduce_sum(video_mask)

    loss = tf.constant(caption_weight) * loss_caption + tf.constant(video_weight) * loss_video + \
        tf.constant(latent_weight) * loss_latent + tf.constant(tied_weight) * loss_tied
    return loss, loss_caption, loss_tied, loss_latent, loss_video, output_semantic, output1, output2
def build_input(dataset, data_path, batch_size, mode):
    """Build CIFAR image and labels.

    Args:
      dataset: Either 'cifar10' or 'cifar100'.
      data_path: Filename for raw_data.
      batch_size: Input batch size.
      mode: Either 'train' or 'eval'. Any value other than 'train' takes the
        evaluation path.
    Returns:
      images: Batches of images. [batch_size, image_size, image_size, 3]
      labels: Batches of labels. [batch_size, num_classes]
    Raises:
      ValueError: when the specified dataset is not supported.
    """
    image_size = 32
    if dataset == 'cifar10':
        label_bytes = 1
        label_offset = 0
        num_classes = 10
    elif dataset == 'cifar100':
        # The label that is read sits one byte into each record.
        label_bytes = 1
        label_offset = 1
        num_classes = 100
    else:
        # Format the message here; passing `dataset` as a second argument
        # would leave the '%s' placeholder unexpanded in the exception.
        raise ValueError('Not supported dataset %s' % dataset)

    depth = 3
    image_bytes = image_size * image_size * depth
    record_bytes = label_bytes + label_offset + image_bytes

    data_files = tf.gfile.Glob(data_path)
    file_queue = tf.train.string_input_producer(data_files, shuffle=True)
    # Read examples from files in the filename queue.
    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
    _, value = reader.read(file_queue)

    # Convert these examples to dense labels and processed images.
    record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
    label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
    # Convert from string to [depth * height * width] to [depth, height, width].
    # The pixels start after the offset AND the label byte(s); starting at
    # `label_bytes` alone shifts the image by one byte when label_offset > 0.
    depth_major = tf.reshape(
        tf.slice(record, [label_offset + label_bytes], [image_bytes]),
        [depth, image_size, image_size])
    # Convert from [depth, height, width] to [height, width, depth].
    image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)

    if mode == 'train':
        # Pad by 4 pixels, take a random crop back to the original size,
        # then flip at random.
        image = tf.image.resize_image_with_crop_or_pad(
            image, image_size + 4, image_size + 4)
        image = tf.random_crop(image, [image_size, image_size, 3])
        image = tf.image.random_flip_left_right(image)
        # Brightness/saturation/constrast provides small gains .2%~.5% on cifar.
        # image = tf.image.random_brightness(image, max_delta=63. / 255.)
        # image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
        # image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
        image = tf.image.per_image_standardization(image)

        example_queue = tf.RandomShuffleQueue(
            capacity=16 * batch_size,
            min_after_dequeue=8 * batch_size,
            dtypes=[tf.float32, tf.int32],
            shapes=[[image_size, image_size, depth], [1]])
        num_threads = 16
    else:
        image = tf.image.resize_image_with_crop_or_pad(
            image, image_size, image_size)
        # Use the same normalisation op as the training branch so both modes
        # rely on a single API name.
        image = tf.image.per_image_standardization(image)

        example_queue = tf.FIFOQueue(
            3 * batch_size,
            dtypes=[tf.float32, tf.int32],
            shapes=[[image_size, image_size, depth], [1]])
        num_threads = 1

    example_enqueue_op = example_queue.enqueue([image, label])
    tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
        example_queue, [example_enqueue_op] * num_threads))

    # Read 'batch' labels + images from the example queue.
    images, labels = example_queue.dequeue_many(batch_size)
    # Turn the integer labels into dense one-hot rows.
    labels = tf.reshape(labels, [batch_size, 1])
    indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
    labels = tf.sparse_to_dense(
        tf.concat(1, [indices, labels]),
        [batch_size, num_classes], 1.0, 0.0)

    assert len(images.get_shape()) == 4
    assert images.get_shape()[0] == batch_size
    assert images.get_shape()[-1] == 3
    assert len(labels.get_shape()) == 2
    assert labels.get_shape()[0] == batch_size
    assert labels.get_shape()[1] == num_classes

    # Display the training images in the visualizer.
    tf.image_summary('images', images)
    return images, labels
# Mark the current variable scope as reusing, so the inference() calls below
# share variables instead of creating new ones (presumably those built by an
# earlier inference() call outside this excerpt -- TODO confirm).
tf.get_variable_scope().reuse_variables()
accuracy_logits = inference(validate_batch_ids, validate_batch_values)
validate_softmax = tf.nn.softmax(accuracy_logits)
validate_batch_labels = tf.to_int64(validate_batch_labels)
# Accuracy: fraction of rows whose arg-max class equals the label.
correct_prediction = tf.equal(tf.argmax(validate_softmax, 1),
                              validate_batch_labels)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Compute auc
# The integer labels are expanded into a dense [derived_size, LABEL_SIZE]
# matrix with 1.0 at (row, label) and 0.0 elsewhere, so they can be compared
# with the softmax output.
validate_batch_labels = tf.cast(validate_batch_labels, tf.int32)
sparse_labels = tf.reshape(validate_batch_labels, [-1, 1])
derived_size = tf.shape(validate_batch_labels)[0]
indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
concated = tf.concat(1, [indices, sparse_labels])
outshape = tf.pack([derived_size, LABEL_SIZE])
new_validate_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
_, auc_op = tf.contrib.metrics.streaming_auc(validate_softmax,
                                             new_validate_batch_labels)

# Define inference op
# Two sparse tensors are built from the same index and shape placeholders:
# one carrying the ids, the other carrying the values.
sparse_index = tf.placeholder(tf.int64)
sparse_ids = tf.placeholder(tf.int64)
sparse_values = tf.placeholder(tf.float32)
sparse_shape = tf.placeholder(tf.int64)
inference_ids = tf.SparseTensor(sparse_index, sparse_ids, sparse_shape)
inference_values = tf.SparseTensor(sparse_index, sparse_values, sparse_shape)
inference_logits = inference(inference_ids, inference_values)
inference_softmax = tf.nn.softmax(inference_logits)
# Predicted class index for each row.
inference_op = tf.argmax(inference_softmax, 1)

# Initialize saver and summary
def build_model(self):
    """Assemble the soft-attention captioning graph used for training.

    At every caption step the embedded frames are scored against the
    previous LSTM output, the masked and normalised scores weight the
    frames into a context vector, and LSTM3 consumes that context together
    with the embedding of the current ground-truth word.

    Returns:
        Tuple ``(loss, video, video_mask, caption, caption_mask)``: the
        masked cross-entropy summed over all steps and divided by the sum of
        ``caption_mask``, followed by the four input placeholders.

    ``n_caption_step`` is not defined in this method -- presumably a
    module-level constant; TODO confirm.
    """
    batch = self.batch_size
    n_frames = self.n_lstm_steps
    hidden = self.dim_hidden

    # Input placeholders.
    video = tf.placeholder(
        tf.float32, [batch, n_frames, self.dim_image])  # b x n x d
    video_mask = tf.placeholder(tf.float32, [batch, n_frames])  # b x n
    caption = tf.placeholder(tf.int32, [batch, n_caption_step])
    caption_mask = tf.placeholder(tf.float32, [batch, n_caption_step])

    # Embed every frame, then lay the result out time-major.
    frames = tf.reshape(video, [-1, self.dim_image])  # (b x n) x d
    frames = tf.nn.xw_plus_b(
        frames, self.encode_image_W, self.encode_image_b)  # (b x n) x h
    frames = tf.reshape(frames, [batch, n_frames, hidden])  # b x n x h
    frames = tf.transpose(frames, [1, 0, 2])  # n x b x h

    lstm_state = tf.zeros([batch, self.lstm3.state_size])  # b x s
    prev_hidden = tf.zeros([batch, hidden])  # b x h
    total_xent = 0.0
    word_embed = tf.zeros([batch, hidden])  # b x h

    # Step-independent attention pieces, tiled once per frame.
    tiled_w = tf.tile(
        tf.expand_dims(self.embed_att_w, 0), [n_frames, 1, 1])  # n x h x 1
    tiled_ua = tf.tile(
        tf.expand_dims(self.embed_att_Ua, 0), [n_frames, 1, 1])
    frame_term = tf.batch_matmul(
        frames, tiled_ua) + self.embed_att_ba  # n x b x h

    for step in range(n_caption_step):
        # Unnormalised relevance score of each frame.
        scores = tf.matmul(prev_hidden, self.embed_att_Wa) + frame_term
        scores = tf.tanh(scores)  # n x b x h
        scores = tf.batch_matmul(scores, tiled_w)
        scores = tf.reduce_sum(scores, 2)  # n x b

        # Masked exponentials, normalised over the frame axis.
        mask_t = tf.transpose(video_mask)
        weighted = tf.mul(mask_t, tf.exp(scores))  # n x b
        norm = tf.reduce_sum(weighted, 0)  # b
        # Add 1 wherever the sum is exactly 0 so the division is defined.
        norm = norm + tf.to_float(tf.equal(norm, 0))
        alphas = tf.expand_dims(tf.div(weighted, norm), 2)
        alphas = tf.tile(alphas, [1, 1, hidden])  # n x b x h

        # Attention-weighted sum of the frames.
        context = tf.reduce_sum(tf.mul(alphas, frames), 0)  # b x h

        if step > 0:
            tf.get_variable_scope().reuse_variables()
        with tf.variable_scope("LSTM3"):
            lstm_in = tf.concat(1, [context, word_embed])
            lstm_out, lstm_state = self.lstm3_dropout(
                lstm_in, lstm_state)  # b x h

        fused = tf.concat(1, [lstm_out, context, word_embed])
        word_hidden = tf.tanh(
            tf.nn.xw_plus_b(fused, self.embed_nn_Wp,
                            self.embed_nn_bp))  # b x h
        prev_hidden = lstm_out  # b x h

        # Dense one-hot targets for this step: 1.0 at (row, word id).
        targets = tf.expand_dims(caption[:, step], 1)  # b x 1
        rows = tf.expand_dims(tf.range(0, batch, 1), 1)  # b x 1
        coords = tf.concat(1, [rows, targets])  # b x 2
        onehot = tf.sparse_to_dense(
            coords, tf.pack([batch, self.n_words]), 1.0, 0.0)  # b x w

        # The ground-truth word of this step feeds the next step.
        with tf.device("/cpu:0"):
            word_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, step])

        logits = tf.nn.xw_plus_b(
            word_hidden, self.embed_word_W, self.embed_word_b)  # b x w
        xent = tf.nn.softmax_cross_entropy_with_logits(logits, onehot)
        xent = xent * caption_mask[:, step]
        total_xent = total_xent + tf.reduce_sum(xent)

    loss = total_xent / tf.reduce_sum(caption_mask)
    return loss, video, video_mask, caption, caption_mask
def build_model(self):
    """Build the two-layer LSTM encoder/decoder training graph.

    The encoder runs LSTM1 over the embedded frames and LSTM2 over
    ``[padding, output1]``. The decoder then feeds ``padding`` to LSTM1 and
    ``[word embedding, output1]`` to LSTM2, predicting ``caption[:, i + 1]``
    from ``caption[:, i]``.

    Returns:
        Tuple ``(loss, video, video_mask, caption, caption_mask, probs)``:
        the sum over decoder steps of the masked cross-entropy divided by
        ``self.batch_size``, the four input placeholders, and the list of
        per-step word logits.
    """
    video = tf.placeholder(
        tf.float32,
        [self.batch_size, self.n_video_lstm_step, self.dim_image])
    video_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_video_lstm_step])
    caption = tf.placeholder(
        tf.int32, [self.batch_size, self.n_caption_lstm_step + 1])
    caption_mask = tf.placeholder(
        tf.float32, [self.batch_size, self.n_caption_lstm_step + 1])

    video_flat = tf.reshape(video, [-1, self.dim_image])
    image_emb = tf.nn.xw_plus_b(
        video_flat, self.encode_image_W,
        self.encode_image_b)  # (batch_size*n_video_lstm_step, dim_hidden)
    # Reshape with the same step count the placeholder and the encoder loop
    # use.  The original used self.n_lstm_steps here, which is only a valid
    # reshape when it equals self.n_video_lstm_step.
    image_emb = tf.reshape(
        image_emb,
        [self.batch_size, self.n_video_lstm_step, self.dim_hidden])

    state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
    state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
    padding = tf.zeros([self.batch_size, self.dim_hidden])

    probs = []
    loss = 0.0

    # ---------------- Encoding stage ----------------
    # Created once, before the loop, so it collects the LSTM1 state of every
    # step; re-creating it inside the loop kept only the last state.
    h_list = []
    for i in range(0, self.n_video_lstm_step):
        if i > 0:
            tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("LSTM1"):
            output1, state1 = self.lstm1(image_emb[:, i, :], state1)
            h_list.append(state1)

        with tf.variable_scope("LSTM2"):
            output2, state2 = self.lstm2(
                tf.concat(1, [padding, output1]), state2)

    # NOTE(review): the stacked encoder states are not consumed by the rest
    # of this method.
    h_list = tf.stack(h_list, axis=1)

    # ---------------- Decoding stage ----------------
    # Phase 2 => only generate captions
    for i in range(0, self.n_caption_lstm_step):
        with tf.device("/gpu:0"):
            current_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, i])

        tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("LSTM1"):
            output1, state1 = self.lstm1(padding, state1)

        with tf.variable_scope("LSTM2"):
            output2, state2 = self.lstm2(
                tf.concat(1, [current_embed, output1]), state2)

        # Dense one-hot targets for the next word: 1.0 at
        # (row, caption[row, i + 1]), 0.0 elsewhere.
        labels = tf.expand_dims(caption[:, i + 1], 1)
        indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
        concated = tf.concat(1, [indices, labels])
        onehot_labels = tf.sparse_to_dense(
            concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)

        logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W,
                                      self.embed_word_b)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logit_words, onehot_labels)
        # NOTE(review): the target is caption[:, i + 1] but the mask column
        # is i; kept as in the original -- confirm against how the caller
        # builds caption_mask.
        cross_entropy = cross_entropy * caption_mask[:, i]
        probs.append(logit_words)

        current_loss = tf.reduce_sum(cross_entropy) / self.batch_size
        loss = loss + current_loss

    return loss, video, video_mask, caption, caption_mask, probs
def build_model(self):
    """Build the reward-weighted (policy-gradient) seq2seq training graph.

    The encoder runs LSTM1 over the embedded input word vectors and LSTM2
    over ``[padding, output1]``. The decoder feeds ``padding`` to LSTM1 and
    ``[word embedding, output1]`` to LSTM2. Each step's masked cross-entropy
    is multiplied by that step's reward column before it is added to the
    loss.

    Returns:
        Tuple ``(train_op, pg_loss, input_tensors, feats)``: the Adam
        minimise op for ``pg_loss``, the loss tensor, a dict of the four
        input placeholders, and ``{'entropies': [...]}`` holding the masked
        per-step cross-entropies.
    """
    batch = self.batch_size
    enc_steps = self.n_encode_lstm_step
    dec_steps = self.n_decode_lstm_step

    # Input placeholders and the embedded encoder inputs.
    word_vectors = tf.placeholder(
        tf.float32, [batch, enc_steps, self.dim_wordvec])
    caption = tf.placeholder(tf.int32, [batch, dec_steps + 1])
    caption_mask = tf.placeholder(tf.float32, [batch, dec_steps + 1])

    flat_vectors = tf.reshape(word_vectors, [-1, self.dim_wordvec])
    embedded = tf.nn.xw_plus_b(
        flat_vectors, self.encode_vector_W,
        self.encode_vector_b)  # (batch_size*n_encode_lstm_step, dim_hidden)
    embedded = tf.reshape(embedded, [batch, enc_steps, self.dim_hidden])

    reward = tf.placeholder(tf.float32, [batch, dec_steps])

    top_state = tf.zeros([batch, self.lstm1.state_size])
    bottom_state = tf.zeros([batch, self.lstm2.state_size])
    padding = tf.zeros([batch, self.dim_hidden])

    entropies = []
    pg_loss = 0.  # policy gradient loss

    # ---------------- Encoding stage ----------------
    for t in range(enc_steps):
        if t > 0:
            tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("LSTM1"):
            top_out, top_state = self.lstm1(embedded[:, t, :], top_state)

        with tf.variable_scope("LSTM2"):
            bottom_in = tf.concat([padding, top_out], 1)
            bottom_out, bottom_state = self.lstm2(bottom_in, bottom_state)

    # ---------------- Decoding stage ----------------
    for t in range(dec_steps):
        with tf.device("/cpu:0"):
            word_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, t])

        tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("LSTM1"):
            top_out, top_state = self.lstm1(padding, top_state)

        with tf.variable_scope("LSTM2"):
            bottom_in = tf.concat([word_embed, top_out], 1)
            bottom_out, bottom_state = self.lstm2(bottom_in, bottom_state)

        # Dense one-hot targets for the next word: 1.0 at
        # (row, caption[row, t + 1]), 0.0 elsewhere.
        next_words = tf.expand_dims(caption[:, t + 1], 1)
        rows = tf.expand_dims(tf.range(0, batch, 1), 1)
        coords = tf.concat([rows, next_words], 1)
        onehot = tf.sparse_to_dense(
            coords, tf.stack([batch, self.n_words]), 1.0, 0.0)

        logits = tf.nn.xw_plus_b(bottom_out, self.embed_word_W,
                                 self.embed_word_b)
        step_xent = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=onehot)
        step_xent = step_xent * caption_mask[:, t]
        entropies.append(step_xent)

        # Reward-weighted loss for this step, divided by the batch size.
        weighted_xent = step_xent * reward[:, t]
        pg_loss = pg_loss + tf.reduce_sum(weighted_xent) / batch

    # The optimizer is built in a scope with reuse switched off.
    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        train_op = tf.train.AdamOptimizer(self.lr).minimize(pg_loss)

    input_tensors = {
        'word_vectors': word_vectors,
        'caption': caption,
        'caption_mask': caption_mask,
        'reward': reward
    }
    feats = {'entropies': entropies}
    return train_op, pg_loss, input_tensors, feats