def query2(self, query_img_name, path='validate/pics/*/'):
    self.load_image_features()
    # Find features for the query image
    db_keys = load_label_list(self.db)
    query_features = self.feature_extractor.run_inference_on_image(query_img_name, path=path)
    if query_features is None:
        return None
    similar_imgs = []
    for i, db_name in enumerate(load_label_list(self.db)):
        if db_name != query_img_name:
            corr = np.correlate(query_features, self.db_features[db_name])
            if corr[0] >= 0.05:
                similar_imgs.append(db_keys[i])
    return similar_imgs
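# A minimal, self-contained sketch (not part of the original repo) of the similarity test
# used in query2 above: for two equal-length 1-D inputs, np.correlate (default 'valid'
# mode) returns a single-element array holding their dot product, and query2 treats a
# value of at least 0.05 as "similar". All names and values below are toy stand-ins.
import numpy as np

rng = np.random.RandomState(0)
query_features = rng.rand(1008) / 1008       # toy stand-in for Inception-v3 softmax features
candidate_features = rng.rand(1008) / 1008

corr = np.correlate(query_features, candidate_features)   # shape (1,)
print("dot product:", corr[0], "- similar:", corr[0] >= 0.05)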
def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
    compressions = []

    # Finding features
    diffnet = DiffNet(self.db, db_path=self.db_path)
    diffnet.restore(diffnet_path)
    print("Calculating features for", len(cluster_images), "images")
    for img in cluster_images:
        print("Finding features for:", img)
        one_hot = diffnet.feedforward(img, cluster_db_path)
        output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
        compressions.append(output[0])

    # Clustering
    print("Performing clustering...")
    compressions = np.array(compressions)
    fa = FeatureAgglomeration(n_clusters=30)
    X_clusters = fa.fit_transform(compressions)

    # Collecting data
    print("Collecting data...")
    csv_dict_arr = []
    for i, img in enumerate(cluster_images):
        csv_dict_arr.append({'1.img': img,
                             '2.class': np.argmax(X_clusters[i]),
                             '3.features': compressions[i]})

    # Saving
    if save_csv:
        print("Saving data to csv...")
        keys = load_label_list(csv_dict_arr[0])
        with open('cluster_result.csv', 'w') as output_file:
            dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
            dict_writer.writeheader()
            dict_writer.writerows(csv_dict_arr)

    return csv_dict_arr
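# Hedged sketch (not repo code) of the FeatureAgglomeration step used in cluster() above.
# FeatureAgglomeration groups *columns* (features) into n_clusters groups, and
# fit_transform returns one pooled value per group for each sample; cluster() then takes
# the argmax over those pooled values as a rough class id. Shapes and names here are
# assumptions chosen to mirror the 50-dim compressed codes.
import numpy as np
from sklearn.cluster import FeatureAgglomeration

compressions = np.random.rand(200, 50)        # e.g. 200 images, 50-dim compressed codes
fa = FeatureAgglomeration(n_clusters=30)
X_clusters = fa.fit_transform(compressions)   # shape (200, 30)

for i in range(3):
    print("image", i, "-> pseudo class", np.argmax(X_clusters[i]))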
def __init__(self, db, db_path='./train/pics/*/',
             db_features_path='./1008_features/train_db_features.pickle'):
    self.feature_extractor = None
    if db is None:
        self.db = pickle.load(open('./train/pickle/combined.pickle', 'rb'))
    else:
        self.db = db
    self.db_features = None
    self.db_path = db_path
    self.db_features_name = db_features_path
    self.all_labels = load_label_list(self.db)
    self.history_sampling_rate = 50
    self.display_step = 1
    self.graph = tf.Graph()

    # tf placeholders
    self.Q = None  # Query input
    self.Y = None  # One hots
    self.y_pred = None
    self.sess = None
    self.keep_prob = None  # for dropout
    self.saver = None

    # Network Parameters
    self.n_input = 1008  # 1008 features from the last layer in Inception-v3
    self.n_output = len(self.all_labels)  # Equality score for each image in db
    self.n_hidden_1 = 500  # 1st hidden layer
    self.model_name = 'diff_net.ckpt'
    self.cost_history = []
    self.test_acc_history = []
def query(self, query_img_name, path='validate/pics/*/'):
    self.load_image_features()
    numpy_features = self.load_numpy_features(self.db_features)

    # Find features for the query img
    # TODO possible to check if the img is a part of self.db and use the pre-computed features
    db_keys = load_label_list(self.db)
    query_features = self.feature_extractor.run_inference_on_image(query_img_name, path=path)
    if query_features is None:
        return None

    # Pair the query features with every database image and predict all equality scores at once
    multi_query_features = np.tile(query_features, (len(numpy_features), 1))
    multi_query_features = np.concatenate((multi_query_features, numpy_features), axis=1)
    equality_scores = self.sess.run(self.y_pred,
                                    feed_dict={self.Q: multi_query_features, self.keep_prob: 1.0})
    eq_threshold = equality_scores.max() - equality_scores.std()
    print("MAX:", equality_scores.max(), "MIN:", equality_scores.min(),
          "MEAN:", equality_scores.mean(), "STD:", equality_scores.std())

    similar_imgs = []
    # TODO make this with np.where()
    for i, eq_s in enumerate(equality_scores):
        if eq_s[0] > eq_threshold:
            similar_imgs.append(db_keys[i])
    return similar_imgs
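# Illustration only (toy sizes, hypothetical names): how query() above pairs one query
# feature vector with every database feature vector in a single batch. The query vector
# is tiled to the number of database rows and concatenated column-wise, giving one
# [query | candidate] row per database image for a single pass through the network.
import numpy as np

n_db, n_features = 5, 1008
query_features = np.random.rand(n_features)
numpy_features = np.random.rand(n_db, n_features)        # one row per database image

multi_query_features = np.tile(query_features, (n_db, 1))                     # (5, 1008)
multi_query_features = np.concatenate((multi_query_features, numpy_features), axis=1)
print(multi_query_features.shape)  # (5, 2016): query features followed by candidate features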
def query(self, query_img_name, path='validate/pics/*/'):
    self.load_image_features()

    # Find features for the query img
    # TODO possible to check if the img is a part of self.db and use the pre-computed features
    db_keys = load_label_list(self.db)
    query_features = self.feature_extractor.run_inference_on_image(query_img_name, path=path)
    if query_features is None:
        return None

    # Indexes of the 50 highest-valued query features, in descending rank order
    select_max_features = 50
    query_features_indexes_max = query_features.argsort()[-select_max_features:][::-1]

    similar_imgs = []
    for t in range(0, select_max_features):
        # Gradually relax how many rank-aligned feature indexes must match
        equality_threshold = select_max_features - t
        for i, test_name in enumerate(load_label_list(self.db_features)):
            if test_name != query_img_name:
                test_features_indexes_max = self.db_features[test_name].argsort()[-select_max_features:][::-1]
                nr_of_equal_features_max = np.count_nonzero(
                    query_features_indexes_max == test_features_indexes_max)
                if nr_of_equal_features_max >= equality_threshold:
                    similar_imgs.append(db_keys[i])
            if len(similar_imgs) >= 30:
                print("Breaks because of big cluster...")
                break
        if len(similar_imgs) >= 30:
            break
    return similar_imgs
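# Hedged sketch (not repo code) of the top-feature comparison in the query() variant
# above: both vectors are reduced to their 50 highest-valued feature indexes in rank
# order, and the elementwise comparison counts *positional* matches (same index at the
# same rank), not set overlap. Names and values below are illustrative only.
import numpy as np

select_max_features = 50
query_features = np.random.rand(1008)
test_features = np.random.rand(1008)

q_top = query_features.argsort()[-select_max_features:][::-1]
t_top = test_features.argsort()[-select_max_features:][::-1]
nr_of_equal_features_max = np.count_nonzero(q_top == t_top)
print("rank-aligned matches:", nr_of_equal_features_max)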
def __init__(self, db, db_path='./train/pics/*/',
             db_features_path='./1008_features/train_db_features.pickle'):
    self.feature_extractor = None
    if db is None:
        self.db = pickle.load(open('./train/pickle/combined.pickle', 'rb'))
    else:
        self.db = db
    self.db_features = None
    self.db_path = db_path
    self.db_features_name = db_features_path
    self.one_hot_indexes_path = './precomputed_labels/015_threshold_one_hot_indexes-'
    self.all_labels = load_label_list(self.db)
    self.history_sampling_rate = 50
    self.display_step = 1
    self.graph = tf.Graph()

    # tf placeholders
    self.Q = None  # One-hot input
    self.Y = None  # One-hot output
    self.y_pred = None
    self.sess = None
    self.keep_prob = None  # for dropout
    self.saver = None
    self.compressed = None

    # Network Parameters
    self.n_input = len(load_label_list(self.db))  # Typically 100k one-hot index input
    self.n_output = self.n_input  # Same size as the input
    self.n_hidden_1 = 500  # encoder/decoder hidden layer size
    self.n_compressed = 50  # Compressed (middle) layer size
    self.model_name = 'feature_auto_encoder.ckpt'
    self.cost_history = []
    self.test_acc_history = []
def run_inference_on_images(self, images, path='validate/pics/*/',
                            save_name='validation_feature_embedding.pickle'):
    """Runs inference on multiple images.

    Args:
        images: Image file names.
        path: Glob path to the directories containing the images.
        save_name: File name used when pickling the extracted features.

    Returns:
        Features for those images.
    """
    preds = {}
    if self.sess is None:
        self.create_session()
    image_list = load_label_list(images)
    layer = self.sess.graph.get_tensor_by_name('softmax:0')
    for i, image in enumerate(image_list):
        if i % 1000 == 0:
            print("Extracting features from image nr:", i + 1)
        image_full = find_img_path(path, image)
        if image_full is not None and tf.gfile.Exists(image_full):
            image_data = tf.gfile.FastGFile(image_full, 'rb').read()
            predictions = self.sess.run(layer, {'DecodeJpeg/contents:0': image_data})
            preds[image] = np.squeeze(predictions)
        else:
            # Did not find any image
            print("Did not find image", image, "during feature extracting!")
            preds[image] = None
    with open(save_name, 'wb') as handle:
        pickle.dump(preds, handle)
    return preds
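# Hypothetical usage sketch (not part of the repo): reading back the feature dictionary
# that run_inference_on_images() pickles to disk. The file name matches the default
# save_name above; keys are image names, values are 1-D feature arrays or None for
# images that were not found.
import pickle

with open('validation_feature_embedding.pickle', 'rb') as handle:
    preds = pickle.load(handle)

for name, features in list(preds.items())[:3]:
    print(name, None if features is None else features.shape)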
if __name__ == '__main__':
    # test_features_name = '/home/mikkel/deep_learning_exam/2048_features/test_db_features.npy'
    train_features_path = '/home/mikkel/deep_learning_exam/1008_features/train_db_features.pickle'
    net = DiffNet('train', db_features_path=train_features_path)
    test_labels = generate_dict_from_directory(pickle_file='./validate/pickle/combined.pickle',
                                               directory='./validate/txt/')
    train_labels = generate_dict_from_directory()
    all_labels = {**test_labels, **train_labels}

    test_ids = load_label_list(test_labels)[:1000]
    scores = []
    for j, query_img in enumerate(test_ids):
        cluster = net.query2(query_img)
        if cluster is not None and len(cluster) > 0:
            score_res = score(all_labels, target=query_img, selection=cluster)
            scores.append(score_res)
            print('%05d' % j, "\t", query_img, "- cluster size:", '%04d' % len(cluster),
                  "- score", score_res, "- avg:", reduce(lambda x, y: x + y, scores) / len(scores))
        else:
            scores.append(0.0)
            print('%05d' % j, "\t", query_img, "- No similar images found...",
                  "- avg:", reduce(lambda x, y: x + y, scores) / len(scores))
                    dest='small_training_set')
parser.add_argument('-save_csv', default=True, type=bool,
                    help='If the results should be saved to a csv file or not', dest='save_csv')
parser.add_argument('-test_path', default="validate",
                    help='Path to pickle files that should be clustered', dest='test_path')
parser.add_argument('-restore_path', default="./feature_autoencoder/",
                    help='Path to saved model', dest='restore_path')
args = parser.parse_args()

start_time = time.time()
train_features_path = '1008_features/train_db_features.pickle'
training_labels = pickle.load(open('./train/pickle/combined.pickle', 'rb'))
net = FeaturesAutoencoder(training_labels, db_path='./train/pics/*/',
                          db_features_path=train_features_path)
if args.train:
    # Training for 20 epochs
    net.train(training_epochs=20, learning_rate=.003, batch_size=64, save=True,
              show_cost=True, show_example=False, save_path=args.restore_path,
              small_training_set=args.small_training_set)
else:
    # Restoring pre-trained model
    net.restore(args.restore_path)

testing_labels = pickle.load(open('./' + args.test_path + '/pickle/combined.pickle', 'rb'))
cluster_lbs = load_label_list(testing_labels)
cluster = net.cluster(cluster_lbs, './' + args.test_path + '/pics/*/', save_csv=args.save_csv)
print("Time used:", time.time() - start_time)
def load_numpy_features(self, db_features):
    features = []
    for img_name in load_label_list(db_features):
        features.append(db_features[img_name])
    return np.array(features)
def train(self, training_epochs=20, learning_rate=0.01, batch_size=32, show_cost=False,
          show_test_acc=False, save=False, save_path='diffnet1/', logger=True):
    # Load and preprocess data
    if logger:
        print("Loading and preprocessing data...")
    X_train = load_label_list(self.db)
    X_test = load_label_list(self.test_db)
    if self.sess is None:
        self.build(learning_rate=learning_rate)
    self.load_image_features(load_test_features=True)
    total_batch = int(len(X_train) / batch_size)
    if logger:
        print("Starting training...")
        print("Total nr of batches:", total_batch)

    # Training cycle
    start_time = time.time()  # used for the per-epoch timing printout below
    training_pair_counter = 0
    for epoch in range(training_epochs):
        # Loop over all batches
        indexes = np.arange(len(X_train))
        for i in range(total_batch):
            idx = np.random.choice(indexes, int(batch_size * 2), replace=True)
            q_idx = idx[:int(len(idx) / 2)]
            t_idx = idx[int(len(idx) / 2):]
            batch_qs = X_train[q_idx]  # Query images
            batch_ts = X_train[t_idx]  # Test images
            # Shuffling
            p = np.random.permutation(len(q_idx))
            batch_qs = batch_qs[p]
            batch_ts = batch_ts[p]
            batch_qs_f = [self.db_features[x] for x in batch_qs]
            batch_ts_f = [self.db_features[x] for x in batch_ts]
            batch_qs_f, batch_ts_f, batch_ys = calculate_score_batch(batch_qs, batch_ts, self.db,
                                                                     q_features=batch_qs_f,
                                                                     t_features=batch_ts_f)

            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = self.sess.run([self.optimizer, self.loss_function],
                                 feed_dict={self.Q: batch_qs_f, self.T: batch_ts_f,
                                            self.Y: batch_ys, self.keep_prob: .5})
            training_pair_counter += len(batch_qs)

            if i % self.history_sampling_rate == 0:
                self.cost_history.append(c)
                # Quick validation check on a random mini-batch
                test_idx = np.random.choice(np.arange(0, len(X_test)), batch_size)
                test_q_idx = test_idx[:int(len(test_idx) / 2)]
                test_t_idx = test_idx[int(len(test_idx) / 2):]
                test_batch_qs = X_test[test_q_idx]  # Query images
                test_batch_ts = X_test[test_t_idx]  # Test images
                test_batch_qs_f = [self.test_db_features[x] for x in test_batch_qs]
                test_batch_ts_f = [self.test_db_features[x] for x in test_batch_ts]
                test_batch_qs_f, test_batch_ts_f, test_batch_ys = calculate_score_batch(
                    test_batch_qs, test_batch_ts, self.test_db,
                    q_features=test_batch_qs_f, t_features=test_batch_ts_f)
                acc = self.sess.run(self.loss_function,
                                    feed_dict={self.Q: test_batch_qs_f, self.T: test_batch_ts_f,
                                               self.Y: test_batch_ys, self.keep_prob: 1.})
                self.test_acc_history.append(acc)
                print("Batch index:", '%04d' % i, "Cost:", c, "Validation accuracy:", acc)

        # Do a more precise test after each epoch
        if epoch % self.display_step == 0:
            test_idx = np.random.choice(np.arange(0, len(X_test)), 1000)
            test_q_idx = test_idx[:int(len(test_idx) / 2)]
            test_t_idx = test_idx[int(len(test_idx) / 2):]
            test_batch_qs = X_test[test_q_idx]  # Query images
            test_batch_ts = X_test[test_t_idx]  # Test images
            test_batch_qs_f = [self.test_db_features[x] for x in test_batch_qs]
            test_batch_ts_f = [self.test_db_features[x] for x in test_batch_ts]
            test_batch_qs_f, test_batch_ts_f, test_batch_ys = calculate_score_batch(
                test_batch_qs, test_batch_ts, self.test_db,
                q_features=test_batch_qs_f, t_features=test_batch_ts_f)
            test_accuracy = self.sess.run(self.loss_function,
                                          feed_dict={self.Q: test_batch_qs_f, self.T: test_batch_ts_f,
                                                     self.Y: test_batch_ys, self.keep_prob: 1.})
            print("Epoch:", '%03d' % (epoch + 1),
                  "total trained pairs:", '%09d' % training_pair_counter,
                  "\ttest acc =", test_accuracy, "- time used:", time.time() - start_time)
            if save:
                self.saver.save(self.sess, save_path + self.model_name, global_step=epoch + 1)

    # Printing out some comparisons
    test_idx = np.random.choice(np.arange(0, len(X_test)), 200)
    test_q_idx = test_idx[:int(len(test_idx) / 2)]
    test_t_idx = test_idx[int(len(test_idx) / 2):]
    test_batch_qs = X_test[test_q_idx]  # Query images
    test_batch_ts = X_test[test_t_idx]  # Test images
    test_batch_qs_f = [self.test_db_features[x] for x in test_batch_qs]
    test_batch_ts_f = [self.test_db_features[x] for x in test_batch_ts]
    test_batch_qs_f, test_batch_ts_f, test_batch_ys = calculate_score_batch(
        test_batch_qs, test_batch_ts, self.test_db,
        q_features=test_batch_qs_f, t_features=test_batch_ts_f)
    output = self.sess.run(self.y_pred,
                           feed_dict={self.Q: test_batch_qs_f, self.T: test_batch_ts_f,
                                      self.Y: test_batch_ys, self.keep_prob: 1.})
    print(test_batch_ys)
    print(output)

    if show_test_acc:
        plt.plot(np.array(self.test_acc_history))
        plt.show()
    if show_cost:
        plt.plot(np.array(self.cost_history))
        plt.show()
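# Illustration only (toy sizes, hypothetical names): the batch construction used in
# train() above. 2 * batch_size indexes are drawn with replacement, split into a query
# half and a test half, and both halves are shuffled with the same permutation so the
# query/test pairs stay aligned.
import numpy as np

X_train = np.array(['img_%03d' % i for i in range(100)])
batch_size = 8
indexes = np.arange(len(X_train))
idx = np.random.choice(indexes, batch_size * 2, replace=True)

q_idx = idx[:len(idx) // 2]
t_idx = idx[len(idx) // 2:]
batch_qs, batch_ts = X_train[q_idx], X_train[t_idx]

p = np.random.permutation(len(q_idx))   # shared shuffle keeps each (query, test) pair together
batch_qs, batch_ts = batch_qs[p], batch_ts[p]
print(list(zip(batch_qs, batch_ts))[:3])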