def __init__(self, config):
    """Build the image-only emotion classifier graph from ``config``.

    Wires an externally assignable learning-rate variable, loads a
    labelled batch from the dataset split named by ``config['mode']``,
    and attaches an InceptionV1 head producing one logit per emotion
    class. Exposes ``self.logits`` and ``self.labels`` for training.
    """
    self.config = config
    run_mode = config['mode']
    data_dir = config['dataset_dir']
    start_lr = config['initial_lr']
    n_per_batch = config['batch_size']
    endpoint = config['final_endpoint']

    tf.logging.set_verbosity(tf.logging.INFO)

    # The learning rate is a variable so it can be changed mid-training:
    # feed lr_rate_placeholder and run lr_rate_assign.
    self.learning_rate = tf.Variable(start_lr, trainable=False)
    self.lr_rate_placeholder = tf.placeholder(tf.float32)
    self.lr_rate_assign = self.learning_rate.assign(self.lr_rate_placeholder)

    # Input pipeline: texts/seq_lens are produced by the loader but this
    # image-only model consumes just the images and labels.
    self.dataset = get_split_with_text(run_mode, data_dir)
    side = inception_v1.default_image_size
    images, _, texts, seq_lens, self.labels, _, _ = load_batch_with_text(
        self.dataset, n_per_batch, height=side, width=side)
    self.nb_emotions = self.dataset.num_classes

    # Create the model, use the default arg scope to configure the batch
    # norm parameters; batch-norm stats only update in training mode.
    with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
        self.logits, _ = inception_v1.inception_v1(
            images,
            final_endpoint=endpoint,
            num_classes=self.nb_emotions,
            is_training=(run_mode == 'train'))
def oasis_evaluation(checkpoint_dir, num_classes):
    """Compute the logits of the OASIS dataset.

    Rebuilds the image+text model graph, restores the latest checkpoint
    from ``checkpoint_dir``, runs every OASIS image/theme pair through it
    one at a time, saves the stacked logits to ``data/oasis_logits.npy``
    and returns them.

    Parameters:
        checkpoint_dir: Checkpoint of the saved model during training.
        num_classes: Number of classes.

    Fixes vs. previous version:
      * ``fc_size`` was never read from the config, so building the dense
        layer raised ``NameError`` — it is now loaded like the other keys.
      * ``nb_iter`` used true division (float in Python 3), which breaks
        ``range()`` — now integer division.
      * The logits evaluation was commented out (leaving ``scores`` empty,
        so ``np.vstack([])`` failed) and replaced by debug ``session.run``
        calls that fed each placeholder to itself — restored the real run.
      * ``word_to_id`` was rebuilt without the ``'<ukn>'`` entry before
        ``_paragraph_to_ids`` — the mapping built for the model is reused.
    """
    with tf.Graph().as_default():
        config = _CONFIG.copy()
        mode = 'validation'
        text_dir = config['text_dir']
        emb_dir = config['emb_dir']
        filename = config['filename']
        im_features_size = config['im_features_size']
        rnn_size = config['rnn_size']
        fc_size = config['fc_size']
        final_endpoint = config['final_endpoint']
        tf.logging.set_verbosity(tf.logging.INFO)

        # Images are fed one at a time and preprocessed inside the graph,
        # hence the single-image placeholder and batch_size of 1.
        batch_size = 1
        image_size = inception_v1.default_image_size
        images = tf.placeholder(tf.float32, [image_size, image_size, 3])
        images_prep = inception_preprocessing.preprocess_image(
            images, image_size, image_size, is_training=False)
        images_prep_final = tf.expand_dims(images_prep, 0)
        texts = tf.placeholder(tf.int32, [batch_size, _POST_SIZE])
        seq_lens = tf.placeholder(tf.int32, [batch_size])

        # Create the model, use the default arg scope to configure the
        # batch norm parameters.
        is_training = (mode == 'train')
        with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
            images_features, _ = inception_v1.inception_v1(
                images_prep_final,
                final_endpoint=final_endpoint,
                num_classes=im_features_size,
                is_training=is_training)

        # Text model: GloVe embedding with an extra all-zero row appended
        # for unknown words, indexed by '<ukn>'.
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, filename)
        vocab_size, embedding_dim = embedding.shape
        word_to_id = dict(zip(vocabulary, range(vocab_size)))
        embedding = np.concatenate([embedding, np.zeros((1, embedding_dim))])
        word_to_id['<ukn>'] = vocab_size
        vocab_size = len(word_to_id)
        nb_emotions = num_classes
        with tf.variable_scope('Text'):
            # Word embedding (restored from the checkpoint, not trained).
            W_embedding = tf.get_variable('W_embedding',
                                          [vocab_size, embedding_dim],
                                          trainable=False)
            input_embed = tf.nn.embedding_lookup(W_embedding, texts)
            # LSTM over the post tokens.
            cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
            rnn_outputs, final_state = tf.nn.dynamic_rnn(
                cell, input_embed, sequence_length=seq_lens, dtype=tf.float32)
            # Take the output at the last valid timestep of each sequence.
            # Need to convert seq_lens to int32 for stack.
            texts_features = tf.gather_nd(
                rnn_outputs,
                tf.stack([tf.range(batch_size),
                          tf.cast(seq_lens, tf.int32) - 1], axis=1))
            # Concatenate image and text features.
            concat_features = tf.concat([images_features, texts_features],
                                        axis=1)
            # Dense layer followed by the softmax projection.
            W_fc = tf.get_variable('W_fc',
                                   [im_features_size + rnn_size, fc_size])
            b_fc = tf.get_variable('b_fc', [fc_size])
            dense_layer = tf.matmul(concat_features, W_fc) + b_fc
            dense_layer_relu = tf.nn.relu(dense_layer)
            W_softmax = tf.get_variable('W_softmax', [fc_size, nb_emotions])
            b_softmax = tf.get_variable('b_softmax', [nb_emotions])
            logits = tf.matmul(dense_layer_relu, W_softmax) + b_softmax

        # Load model weights from the latest training checkpoint.
        checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
        scaffold = monitored_session.Scaffold(init_op=None,
                                              init_feed_dict=None,
                                              init_fn=None,
                                              saver=None)
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold=scaffold,
            checkpoint_filename_with_path=checkpoint_path,
            master='',
            config=None)

        # Load OASIS dataset: one image per theme name, plus the theme
        # text with trailing digits stripped.
        df_oasis = pd.read_csv('data/oasis/OASIS.csv', encoding='utf-8')

        def load_image(name):
            im_path = 'data/oasis/images/' + name.strip() + '.jpg'
            one_im = imread(im_path)
            # [:, :, :3] drops the alpha channel when present.
            one_im = imresize(one_im,
                              ((image_size, image_size, 3)))[:, :, :3]
            return one_im

        df_oasis['image'] = df_oasis['Theme'].map(lambda x: load_image(x))
        df_oasis['Theme'] = df_oasis['Theme'].map(
            lambda x: ''.join([i for i in x if not i.isdigit()]).strip())
        # Reuse the word_to_id built above: it includes the '<ukn>' token
        # that matches the extra embedding row restored by the model.
        df_oasis['text_list'], df_oasis['text_len'] = zip(
            *df_oasis['Theme'].map(lambda x: _paragraph_to_ids(
                x, word_to_id, _POST_SIZE, emotions='')))

        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=None) as session:
            # Integer division: range() requires an int.
            nb_iter = df_oasis.shape[0] // batch_size
            scores = []
            for i in range(nb_iter):
                # The image placeholder holds one unbatched image
                # (batch_size is fixed to 1 above).
                np_images = df_oasis['image'].iloc[i * batch_size]
                np_texts = np.vstack(
                    df_oasis['text_list'][(i * batch_size):
                                          ((i + 1) * batch_size)])
                np_seq_lens = df_oasis['text_len'][
                    (i * batch_size):((i + 1) * batch_size)].values
                scores.append(session.run(logits,
                                          feed_dict={images: np_images,
                                                     texts: np_texts,
                                                     seq_lens: np_seq_lens}))
            scores = np.vstack(scores)
            np.save('data/oasis_logits.npy', scores)
            return scores
def word_most_relevant(top_words, num_classes, checkpoint_dir):
    """Compute gradient of W_embedding to get the word most relevant to a label.

    Rebuilds the image+text model, restores the latest checkpoint, then
    scores each word in ``top_words`` by feeding it as a one-token post
    together with an all-zero image. Saves the per-word logits to
    ``data/top_words_scores.npy`` and the word ids to
    ``data/top_words.npy``.

    Parameters:
        top_words: Sequence of vocabulary ids to score (length should be
            a multiple of the internal batch size of 50; any remainder is
            dropped by the integer division below).
        num_classes: Number of emotion classes.
        checkpoint_dir: Checkpoint of the saved model during training.

    Returns:
        (scores, vocabulary, word_to_id) — logits array of shape
        (n_words, num_classes), the GloVe vocabulary list, and the
        word->id mapping including the '<ukn>' token.

    Fixes vs. previous version:
      * ``fc_size`` was never read from the config, so building the dense
        layer raised ``NameError`` — it is now loaded like the other keys.
      * ``nb_iter`` used true division (float in Python 3), breaking
        ``range()`` — now integer division.
    """
    with tf.Graph().as_default():
        config = _CONFIG.copy()
        mode = 'validation'
        text_dir = config['text_dir']
        emb_dir = config['emb_dir']
        filename = config['filename']
        im_features_size = config['im_features_size']
        rnn_size = config['rnn_size']
        fc_size = config['fc_size']
        final_endpoint = config['final_endpoint']
        tf.logging.set_verbosity(tf.logging.INFO)

        batch_size = 50
        image_size = inception_v1.default_image_size
        images = tf.placeholder(tf.float32,
                                [batch_size, image_size, image_size, 3])
        texts = tf.placeholder(tf.int32, [batch_size, _POST_SIZE])
        seq_lens = tf.placeholder(tf.int32, [batch_size])

        # Create the model, use the default arg scope to configure the
        # batch norm parameters.
        is_training = (mode == 'train')
        with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
            images_features, _ = inception_v1.inception_v1(
                images,
                final_endpoint=final_endpoint,
                num_classes=im_features_size,
                is_training=is_training)

        # Text model: GloVe embedding with an extra all-zero row appended
        # for unknown words, indexed by '<ukn>'.
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, filename)
        vocab_size, embedding_dim = embedding.shape
        word_to_id = dict(zip(vocabulary, range(vocab_size)))
        embedding = np.concatenate([embedding, np.zeros((1, embedding_dim))])
        word_to_id['<ukn>'] = vocab_size
        vocab_size = len(word_to_id)
        nb_emotions = num_classes
        with tf.variable_scope('Text'):
            # Word embedding (restored from the checkpoint, not trained).
            W_embedding = tf.get_variable('W_embedding',
                                          [vocab_size, embedding_dim],
                                          trainable=False)
            input_embed = tf.nn.embedding_lookup(W_embedding, texts)
            # LSTM over the post tokens.
            cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
            rnn_outputs, final_state = tf.nn.dynamic_rnn(
                cell, input_embed, sequence_length=seq_lens, dtype=tf.float32)
            # Take the output at the last valid timestep of each sequence.
            # Need to convert seq_lens to int32 for stack.
            texts_features = tf.gather_nd(
                rnn_outputs,
                tf.stack([tf.range(batch_size),
                          tf.cast(seq_lens, tf.int32) - 1], axis=1))
            # Concatenate image and text features.
            concat_features = tf.concat([images_features, texts_features],
                                        axis=1)
            # Dense layer followed by the softmax projection.
            W_fc = tf.get_variable('W_fc',
                                   [im_features_size + rnn_size, fc_size])
            b_fc = tf.get_variable('b_fc', [fc_size])
            dense_layer = tf.matmul(concat_features, W_fc) + b_fc
            dense_layer_relu = tf.nn.relu(dense_layer)
            W_softmax = tf.get_variable('W_softmax', [fc_size, nb_emotions])
            b_softmax = tf.get_variable('b_softmax', [nb_emotions])
            logits = tf.matmul(dense_layer_relu, W_softmax) + b_softmax

        # Load model weights from the latest training checkpoint.
        checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
        scaffold = monitored_session.Scaffold(init_op=None,
                                              init_feed_dict=None,
                                              init_fn=None,
                                              saver=None)
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold=scaffold,
            checkpoint_filename_with_path=checkpoint_path,
            master='',
            config=None)

        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=None) as session:
            # Integer division: range() requires an int.
            nb_iter = len(top_words) // batch_size
            scores = []
            for i in range(nb_iter):
                # Blank image; posts padded with the '<ukn>' id
                # (vocab_size - 1 after appending the unknown row), with
                # only the first token set to the word under test.
                np_images = np.zeros((batch_size, image_size, image_size, 3))
                np_texts = np.ones((batch_size, _POST_SIZE),
                                   dtype=np.int32) * (vocab_size - 1)
                np_texts[:, 0] = top_words[i * batch_size:(i + 1) * batch_size]
                np_seq_lens = np.ones(batch_size, dtype=np.int32)
                scores.append(session.run(logits,
                                          feed_dict={images: np_images,
                                                     texts: np_texts,
                                                     seq_lens: np_seq_lens}))
            scores = np.vstack(scores)
            np.save('data/top_words_scores.npy', scores)
            np.save('data/top_words.npy', top_words)
            return scores, vocabulary, word_to_id
def __init__(self, config):
    """Build the combined image+text (InceptionV1 + LSTM) emotion model.

    Sets up an assignable learning rate, a batched image/text input
    pipeline, an InceptionV1 image branch, a GloVe-embedded LSTM text
    branch, and a dense+softmax head over the concatenated features.
    Exposes self.logits, self.labels, self.concat_features, and the
    embedding-initialisation ops (self.embedding_placeholder /
    self.embedding_init) for the training loop.
    """
    self.config = config
    mode = config['mode']
    dataset_dir = config['dataset_dir']
    text_dir = config['text_dir']
    emb_dir = config['emb_dir']
    filename = config['filename']
    initial_lr = config['initial_lr']
    batch_size = config['batch_size']
    im_features_size = config['im_features_size']
    rnn_size = config['rnn_size']
    final_endpoint = config['final_endpoint']
    fc_size = config['fc_size']
    tf.logging.set_verbosity(tf.logging.INFO)
    # Learning rate is a variable so it can be updated mid-training by
    # feeding lr_rate_placeholder and running lr_rate_assign.
    self.learning_rate = tf.Variable(initial_lr, trainable=False)
    self.lr_rate_placeholder = tf.placeholder(tf.float32)
    self.lr_rate_assign = self.learning_rate.assign(
        self.lr_rate_placeholder)
    # Input pipeline: images, tokenised posts, sequence lengths, labels,
    # plus post ids and days kept for downstream bookkeeping.
    self.dataset = get_split_with_text(mode, dataset_dir)
    image_size = inception_v1.default_image_size
    images, _, texts, seq_lens, self.labels, self.post_ids, self.days = load_batch_with_text(
        self.dataset, batch_size, height=image_size, width=image_size)
    # Create the model, use the default arg scope to configure the batch
    # norm parameters (batch-norm stats only update in training mode).
    is_training = (mode == 'train')
    with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
        images_features, _ = inception_v1.inception_v1(
            images,
            final_endpoint=final_endpoint,
            num_classes=im_features_size,
            is_training=is_training)
    # Text model: GloVe weights are loaded on the host and pushed into
    # the graph via embedding_init; an extra all-zero row is appended for
    # unknown words ('<ukn>').
    vocabulary, self.embedding = _load_embedding_weights_glove(
        text_dir, emb_dir, filename)
    vocab_size, embedding_dim = self.embedding.shape
    word_to_id = dict(zip(vocabulary, range(vocab_size)))
    # Unknown words = vector with zeros.
    self.embedding = np.concatenate(
        [self.embedding, np.zeros((1, embedding_dim))])
    word_to_id['<ukn>'] = vocab_size
    vocab_size = len(word_to_id)
    self.nb_emotions = self.dataset.num_classes
    # NOTE(review): the extent of the 'Text' variable scope below is
    # inferred from the collapsed source — checkpoint variable names
    # depend on it, so confirm against a saved checkpoint.
    with tf.variable_scope('Text'):
        # Word embedding matrix; frozen (trainable=False) and filled by
        # running embedding_init with embedding_placeholder.
        W_embedding = tf.get_variable('W_embedding',
                                      [vocab_size, embedding_dim],
                                      trainable=False)
        self.embedding_placeholder = tf.placeholder(
            tf.float32, [vocab_size, embedding_dim])
        self.embedding_init = W_embedding.assign(
            self.embedding_placeholder)
        input_embed = tf.nn.embedding_lookup(W_embedding, texts)
        #input_embed_dropout = tf.nn.dropout(input_embed, self.keep_prob)
        # LSTM over the post tokens.
        cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        rnn_outputs, final_state = tf.nn.dynamic_rnn(
            cell, input_embed, sequence_length=seq_lens, dtype=tf.float32)
        # Take the output at the last valid timestep of each sequence.
        # Need to convert seq_lens to int32 for stack.
        # NOTE(review): a sequence length of 0 would index timestep -1
        # here — presumably seq_lens >= 1 is guaranteed upstream; verify.
        texts_features = tf.gather_nd(
            rnn_outputs,
            tf.stack(
                [tf.range(batch_size),
                 tf.cast(seq_lens, tf.int32) - 1],
                axis=1))
        # Concatenate image and text features.
        self.concat_features = tf.concat([images_features, texts_features],
                                         axis=1)
        # Dense layer followed by the softmax projection to the emotion
        # classes.
        W_fc = tf.get_variable('W_fc',
                               [im_features_size + rnn_size, fc_size])
        b_fc = tf.get_variable('b_fc', [fc_size])
        dense_layer = tf.matmul(self.concat_features, W_fc) + b_fc
        dense_layer_relu = tf.nn.relu(dense_layer)
        W_softmax = tf.get_variable('W_softmax',
                                    [fc_size, self.nb_emotions])
        b_softmax = tf.get_variable('b_softmax', [self.nb_emotions])
        self.logits = tf.matmul(dense_layer_relu, W_softmax) + b_softmax
def class_visualisation(label, learning_rate, checkpoint_dir):
    """Visualise class with gradient ascent.

    Rebuilds a variant of the model (image branch + mean-pooled text
    branch), restores the checkpoint, then runs gradient ascent on the
    input image to maximise the logit of ``label``, displaying the image
    periodically with matplotlib.

    Parameters:
        label: Label to visualise.
        learning_rate: Learning rate of the gradient ascent.
        checkpoint_dir: Checkpoint of the saved model during training.
    """
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)
        image_size = inception_v1.default_image_size
        # The image being optimised is fed through this placeholder.
        image = tf.placeholder(tf.float32, [1, image_size, image_size, 3])
        # Text model (paths hard-coded to the GloVe 50-d weights).
        text_dir = 'text_model'
        emb_dir = 'embedding_weights'
        filename = 'glove.6B.50d.txt'
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, filename)
        vocab_size, embedding_dim = embedding.shape
        word_to_id = dict(zip(vocabulary, range(vocab_size)))
        # Create text with only unknown words (id == vocab_size, the
        # all-zero row appended below), so the text branch is neutral.
        text = tf.constant(
            np.ones((1, _POST_SIZE), dtype=np.int32) * vocab_size)
        im_features_size = 128
        # Create the model, use the default arg scope to configure the
        # batch norm parameters.
        with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
            images_features, _ = inception_v1.inception_v1(
                image, num_classes=im_features_size, is_training=True)
        # Unknown words = vector with zeros.
        embedding = np.concatenate([embedding, np.zeros((1, embedding_dim))])
        word_to_id['<ukn>'] = vocab_size
        vocab_size = len(word_to_id)
        nb_emotions = 6
        # NOTE(review): scope extent inferred from the collapsed source —
        # checkpoint variable names depend on it; confirm against a
        # saved checkpoint.
        with tf.variable_scope('Text'):
            embedding_placeholder = tf.placeholder(tf.float32,
                                                   [vocab_size, embedding_dim])
            # Word embedding, frozen and restored from the checkpoint.
            W_embedding = tf.get_variable('W_embedding',
                                          [vocab_size, embedding_dim],
                                          trainable=False)
            embedding_init = W_embedding.assign(embedding_placeholder)
            input_embed = tf.nn.embedding_lookup(W_embedding, text)
            #input_embed_dropout = tf.nn.dropout(input_embed, self.keep_prob)
            # Rescale the mean by the actual number of non-zero values.
            nb_finite = tf.reduce_sum(tf.cast(tf.not_equal(input_embed, 0.0),
                                              tf.float32),
                                      axis=1)
            # If a post has zero finite elements, replace nb_finite by 1
            # to avoid dividing by zero.
            nb_finite = tf.where(tf.equal(nb_finite, 0.0),
                                 tf.ones_like(nb_finite), nb_finite)
            h1 = tf.reduce_mean(input_embed, axis=1) * _POST_SIZE / nb_finite
            fc1_size = 2048
            # Fully connected layer on the pooled text embedding.
            W_fc1 = tf.get_variable('W_fc1', [embedding_dim, fc1_size])
            b_fc1 = tf.get_variable('b_fc1', [fc1_size])
            texts_features = tf.matmul(h1, W_fc1) + b_fc1
            texts_features = tf.nn.relu(texts_features)
            # Concatenate image and text features, then project to the
            # emotion logits.
            concat_features = tf.concat([images_features, texts_features],
                                        axis=1)
            W_softmax = tf.get_variable('W_softmax',
                                        [im_features_size + fc1_size,
                                         nb_emotions])
            b_softmax = tf.get_variable('b_softmax', [nb_emotions])
            logits = tf.matmul(concat_features, W_softmax) + b_softmax
        # Ascent objective: the target class score minus an L2 penalty
        # on the image; the gradient is normalised before each step.
        class_score = logits[:, label]
        l2_reg = 0.001
        regularisation = l2_reg * tf.square(tf.norm(image))
        obj_function = class_score - regularisation
        grad_obj_function = tf.gradients(obj_function, image)[0]
        grad_normalized = grad_obj_function / tf.norm(grad_obj_function)
        # Initialise image with preprocessed random noise.
        image_init = tf.random_normal([image_size, image_size, 3])
        image_init = inception_preprocessing.preprocess_image(
            image_init, image_size, image_size, is_training=False)
        image_init = tf.expand_dims(image_init, 0)
        # Load model weights from the latest training checkpoint.
        checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
        scaffold = monitored_session.Scaffold(init_op=None,
                                              init_feed_dict=None,
                                              init_fn=None,
                                              saver=None)
        session_creator = monitored_session.ChiefSessionCreator(
            scaffold=scaffold,
            checkpoint_filename_with_path=checkpoint_path,
            master='',
            config=None)
        # Regularisation schedule for the ascent loop.
        blur_every = 10
        max_jitter = 16
        show_every = 50
        clip_percentile = 20
        with monitored_session.MonitoredSession(
                session_creator=session_creator, hooks=None) as session:
            np_image = session.run(image_init)
            num_iterations = 500
            for i in range(num_iterations):
                # Randomly jitter the image a bit (spatial roll on the
                # height/width axes) to reduce high-frequency artefacts.
                ox, oy = np.random.randint(-max_jitter, max_jitter + 1, 2)
                np_image = np.roll(np.roll(np_image, ox, 1), oy, 2)
                # Update image along the normalised objective gradient.
                grad_update = session.run(grad_normalized,
                                          feed_dict={image: np_image})
                np_image += learning_rate * grad_update
                # Undo the jitter.
                np_image = np.roll(np.roll(np_image, -ox, 1), -oy, 2)
                # As a regularizer, clip and periodically blur.
                #np_image = np.clip(np_image, -0.2, 0.8)
                # Set pixels with small norm to zero (below the chosen
                # percentile of the current image values).
                min_norm = np.percentile(np_image, clip_percentile)
                np_image[np_image < min_norm] = 0.0
                if i % blur_every == 0:
                    np_image = blur_image(np_image, sigma=0.5)
                # Periodically (and on the last step) display progress.
                if i % show_every == 0 or i == (num_iterations - 1):
                    plt.imshow(deprocess_image(np_image[0]))
                    plt.title('Iteration %d / %d' % (i + 1, num_iterations))
                    plt.gcf().set_size_inches(4, 4)
                    plt.axis('off')
                    plt.show()