Example #1
    def query2(self, query_img_name, path='validate/pics/*/'):
        self.load_image_features()

        # Find features for query img
        db_keys = load_label_list(self.db)
        query_features = self.feature_extractor.run_inference_on_image(
            query_img_name, path=path)
        if query_features is None:
            return None
        similar_imgs = []

        for i, db_name in enumerate(load_label_list(self.db)):
            if db_name != query_img_name:
                corr = np.correlate(query_features, self.db_features[db_name])
                # if random.random() < 0.001:
                #     print(corr[0])

                if corr[0] >= 0.05:
                    similar_imgs.append(db_keys[i])

            # if len(similar_imgs) >= 50:
            #     print("Breaks because of big cluster...")
            #     break

        return similar_imgs

    def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
        compressions = []

        # Finding features
        diffnet = DiffNet(self.db, db_path=self.db_path)
        diffnet.restore(diffnet_path)
        print("Calculating features for", len(cluster_images), "images")
        for img in cluster_images:
            print("Finding features for:", img)
            one_hot = diffnet.feedforward(img, cluster_db_path)
            output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
            compressions.append(output[0])

        # Clustering
        print("Performing clustering...")
        compressions = np.array(compressions)
        fa = FeatureAgglomeration(n_clusters=30)
        X_clusters = fa.fit_transform(compressions)

        print("Collecting data...")
        csv_dict_arr = []
        for i, img in enumerate(cluster_images):
            csv_dict_arr.append({'1.img': img, '2.class': np.argmax(X_clusters[i]), '3.features': compressions[i]})

        # Saving
        if save_csv:
            print("Saving data to csv...")
            keys = load_label_list(csv_dict_arr[0])
            with open('cluster_result.csv', 'w') as output_file:
                dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
                dict_writer.writeheader()
                dict_writer.writerows(csv_dict_arr)

        return csv_dict_arr

    def __init__(self, db, db_path='./train/pics/*/', db_features_path='./1008_features/train_db_features.pickle'):
        self.feature_extractor = None
        if db is None:
            self.db = pickle.load(open('./train/pickle/combined.pickle', 'rb'))
        else:
            self.db = db
        self.db_features = None
        self.db_path = db_path
        self.db_features_name = db_features_path

        self.all_labels = load_label_list(self.db)

        self.history_sampling_rate = 50
        self.display_step = 1

        self.graph = tf.Graph()

        # tf placeholders
        self.Q = None  # Query input
        self.Y = None  # One hots
        self.y_pred = None
        self.sess = None
        self.keep_prob = None  # for dropout
        self.saver = None

        # Network Parameters
        self.n_input = 1008  # 1008 features from last layer in Inception-v3
        self.n_output = len(self.all_labels)  # Equality score for each image in db
        self.n_hidden_1 = 500  # 1st hidden layer

        self.model_name = 'diff_net.ckpt'
        self.cost_history = []
        self.test_acc_history = []
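
For reference, the similarity test in query2 reduces to a single dot product: np.correlate on two equal-length 1-D feature vectors returns a one-element array, and corr[0] is compared against the 0.05 threshold. A minimal illustration with made-up feature values:

import numpy as np

query_features = np.array([0.1, 0.0, 0.3])
db_feat = np.array([0.2, 0.1, 0.2])

# Default 'valid' mode on equal-length vectors yields one value: the dot product
corr = np.correlate(query_features, db_feat)
print(corr[0])  # 0.08 -> would pass the >= 0.05 threshold used in query2
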
Example #4
    def query(self, query_img_name, path='validate/pics/*/'):
        self.load_image_features()

        numpy_features = self.load_numpy_features(self.db_features)

        # Find features for query img
        # TODO possible to check if the img is a part of self.db and use the pre-computed features
        db_keys = load_label_list(self.db)
        query_features = self.feature_extractor.run_inference_on_image(query_img_name, path=path)
        if query_features is None:
            return None
        multi_query_features = np.tile(query_features, (len(numpy_features), 1))
        multi_query_features = np.concatenate((multi_query_features, numpy_features), axis=1)
        equality_scores = self.sess.run(self.y_pred, feed_dict={self.Q: multi_query_features, self.keep_prob: 1.0})
        eq_threshold = equality_scores.max() - equality_scores.std()
        print("MAX:", equality_scores.max(), "MIN:", equality_scores.min(), "MEAN:", equality_scores.mean(), "STD:",
              equality_scores.std())
        similar_imgs = []

        # TODO make this with np.where()
        for i, eq_s in enumerate(equality_scores):
            if eq_s[0] > eq_threshold:
                similar_imgs.append(db_keys[i])

        # TODO predict all equality scores at once! Would possibly be much faster!
        # for i, pred in enumerate(self.db_features):
        #     if pred is not None and db_keys[i] != query_img_name:
        #         equality_score = self.sess.run(self.y_pred, feed_dict={self.Q: [query_features], self.T: [pred],
        #                                                                self.keep_prob: 1.0})
        #         if random.random() < 0.1:
        #             print("EQ score:", equality_score)
        #         if equality_score > 0.1:  # TODO find a way to fine tune this threshold
        #             similar_imgs.append(self.db[i])
        return similar_imgs
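
The np.where() rewrite mentioned in the TODO above is a short vectorized variant of the selection loop; a sketch, assuming equality_scores keeps the (N, 1) shape produced by self.y_pred and db_keys is the list returned by load_label_list(self.db):

import numpy as np

def select_similar(equality_scores, db_keys):
    # Same rule as the loop in query(): keep images whose predicted equality
    # score exceeds max - std over all scores.
    scores = np.asarray(equality_scores).reshape(-1)
    eq_threshold = scores.max() - scores.std()
    idx = np.where(scores > eq_threshold)[0]
    return [db_keys[i] for i in idx]
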
Example #5
    def query(self, query_img_name, path='validate/pics/*/'):
        self.load_image_features()

        # Find features for query img
        # TODO possible to check if the img is a part of self.db and use the pre-computed features
        db_keys = load_label_list(self.db)
        query_features = self.feature_extractor.run_inference_on_image(
            query_img_name, path=path)
        if query_features is None:
            return None
        # query_features_indexes_max = np.where(query_features > query_features.max() * query_features.std())
        select_max_features = 50
        query_features_indexes_max = query_features.argsort(
        )[-select_max_features:][::-1]
        # query_features_indexes_min = np.squeeze(np.where(query_features == query_features.min()))
        similar_imgs = []
        for t in range(0, select_max_features):
            equality_threshold = select_max_features - t
            for i, test_name in enumerate(load_label_list(self.db_features)):
                if test_name != query_img_name:
                    # test_features_indexes_max = np.where(test_features > test_features.max() * (1-query_features.std()))
                    test_features_indexes_max = self.db_features[
                        test_name].argsort()[-select_max_features:][::-1]
                    # test_features_indexes_min = np.squeeze(np.where(test_features == test_features.min()))
                    nr_of_equal_features_max = np.count_nonzero(
                        query_features_indexes_max ==
                        test_features_indexes_max)
                    # nr_of_equal_features_min = np.sum(query_features_indexes_min == test_features_indexes_min)
                    # if nr_of_equal_features_max > 2:
                    #     print(nr_of_equal_features_max)
                    if nr_of_equal_features_max >= equality_threshold:
                        similar_imgs.append(db_keys[i])
                # else:
                #     print("what")

                if len(similar_imgs) >= 30:
                    print("Breaks because of big cluster...")
                    break
            if len(similar_imgs) >= 30:
                break

        return similar_imgs

    def __init__(self, db, db_path='./train/pics/*/', db_features_path='./1008_features/train_db_features.pickle'):
        self.feature_extractor = None
        if db is None:
            self.db = pickle.load(open('./train/pickle/combined.pickle', 'rb'))
        else:
            self.db = db
        self.db_features = None
        self.db_path = db_path
        self.db_features_name = db_features_path

        self.one_hot_indexes_path = './precomputed_labels/015_threshold_one_hot_indexes-'

        self.all_labels = load_label_list(self.db)

        self.history_sampling_rate = 50
        self.display_step = 1

        self.graph = tf.Graph()

        # tf placeholders
        self.Q = None  # One-hot input
        self.Y = None  # One-hot output
        self.y_pred = None
        self.sess = None
        self.keep_prob = None  # for dropout
        self.saver = None
        self.compressed = None

        # Network Parameters
        self.n_input = len(load_label_list(self.db))  # Typically 100k one-hot index input
        self.n_output = self.n_input  # The same as input
        self.n_hidden_1 = 500  # encoder/decoder size for the hidden layers
        self.n_compressed = 50  # Compressed layer (middle layer)

        self.model_name = 'feature_auto_encoder.ckpt'
        self.cost_history = []
        self.test_acc_history = []
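
Note that np.count_nonzero(a == b) on the two argsort index arrays above counts matches at the same rank position, not shared indices. If a rank-insensitive overlap of the top feature indices is wanted, np.intersect1d is one alternative; a small sketch with made-up vectors:

import numpy as np

select_max_features = 3
query_features = np.array([0.9, 0.1, 0.8, 0.7, 0.2])
test_features = np.array([0.8, 0.9, 0.1, 0.7, 0.3])

q_top = query_features.argsort()[-select_max_features:][::-1]  # [0, 2, 3]
t_top = test_features.argsort()[-select_max_features:][::-1]   # [1, 0, 3]

print(np.count_nonzero(q_top == t_top))   # rank-aligned matches: 1 (index 3)
print(len(np.intersect1d(q_top, t_top)))  # shared indices regardless of rank: 2
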
Example #7
    def run_inference_on_images(
            self,
            images,
            path='validate/pics/*/',
            save_name='validation_feature_embedding.pickle'):
        """Runs inference on multiple images.

        Args:
          images: Image file names.
          path:

        Returns:
          Features for those images
        """
        preds = {}
        if self.sess is None:
            self.create_session()

        image_list = load_label_list(images)

        layer = self.sess.graph.get_tensor_by_name('softmax:0')
        for i, image in enumerate(image_list):
            if i % 1000 == 0:
                print("Extracting features from:", i + 1)
            image_full = find_img_path(path, image)
            if image_full is not None and tf.gfile.Exists(image_full):
                image_data = tf.gfile.FastGFile(image_full, 'rb').read()
                predictions = self.sess.run(
                    layer, {'DecodeJpeg/contents:0': image_data})
                predictions = np.squeeze(predictions)
                preds[image] = predictions
            else:
                # Did not find any image
                print("Did not find image", image,
                      "during feature extracting!")
                preds[image] = None
        with open(save_name, 'wb') as handle:
            pickle.dump(preds, handle)
        return preds
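
The pickle written by run_inference_on_images can be read back directly; a short sketch using the default save_name above (each value is the squeezed 'softmax:0' activation for one image, or None when the image file was not found):

import pickle

with open('validation_feature_embedding.pickle', 'rb') as handle:
    preds = pickle.load(handle)

for image_name, features in preds.items():
    if features is not None:
        print(image_name, features.shape)
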
Example #8
        return similar_imgs


if __name__ == '__main__':
    # test_features_name = '/home/mikkel/deep_learning_exam/2048_features/test_db_features.npy'
    train_features_path = '/home/mikkel/deep_learning_exam/1008_features/train_db_features.pickle'
    net = DiffNet('train', db_features_path=train_features_path)
    test_labels = generate_dict_from_directory(
        pickle_file='./validate/pickle/combined.pickle',
        directory='./validate/txt/')

    train_labels = generate_dict_from_directory()

    all_labels = {**test_labels, **train_labels}
    test_ids = load_label_list(test_labels)[:1000]
    scores = []
    for j, query_img in enumerate(test_ids):

        cluster = net.query2(query_img)

        if cluster is not None and len(cluster) > 0:
            score_res = score(all_labels, target=query_img, selection=cluster)
            scores.append(score_res)
            print('%05d' % j, "\t", query_img, "- cluster size:",
                  '%04d' % len(cluster), "- score", score_res, "- avg:",
                  reduce(lambda x, y: x + y, scores) / len(scores))
        else:
            scores.append(0.0)
            print('%05d' % j, "\t", query_img, "- No similar images found...",
                  "- avg:",

    # NOTE: parser construction and the defaults/help texts for '-train' and
    # '-small_training_set' are assumed; only their dest names appear in the snippet.
    parser = argparse.ArgumentParser()
    parser.add_argument('-train', default=False, type=bool,
                        help='If the autoencoder should be trained before clustering', dest='train')
    parser.add_argument('-small_training_set', default=False, type=bool,
                        help='If only a small part of the training set should be used',
                        dest='small_training_set')
    parser.add_argument('-save_csv', default=True, type=bool,
                        help='If the results should be saved to a csv file or not', dest='save_csv')
    parser.add_argument('-test_path', default="validate", help='Path to pickle files that should be clustered',
                        dest='test_path')
    parser.add_argument('-restore_path', default="./feature_autoencoder/", help='Path to saved model',
                        dest='restore_path')

    args = parser.parse_args()

    start_time = time.time()
    train_features_path = '1008_features/train_db_features.pickle'

    training_labels = pickle.load(open('./train/pickle/combined.pickle', 'rb'))

    net = FeaturesAutoencoder(training_labels, db_path='./train/pics/*/', db_features_path=train_features_path)

    if args.train:
        # Training for 20 epochs
        net.train(training_epochs=20, learning_rate=.003, batch_size=64, save=True, show_cost=True, show_example=False,
                  save_path=args.restore_path, small_training_set=args.small_training_set)
    else:
        # Restoring pre-trained model
        net.restore(args.restore_path)

    testing_labels = pickle.load(open('./' + args.test_path + '/pickle/combined.pickle', 'rb'))
    cluster_lbs = load_label_list(testing_labels)
    cluster = net.cluster(cluster_lbs, './' + args.test_path + '/pics/*/', save_csv=args.save_csv)

    print("Time used:", time.time() - start_time)
Example #10
    def load_numpy_features(self, db_features):
        features = []
        for img_name in load_label_list(db_features):
            features.append(db_features[img_name])
        return np.array(features)
Example #11
    def train(self, training_epochs=20, learning_rate=0.01, batch_size=32, show_cost=False, show_test_acc=False,
              save=False, save_path='diffnet1/', logger=True):
        # Load and preprocess data
        if logger:
            print("Loading and preprocessing data...")
        X_train = load_label_list(self.db)
        X_test = load_label_list(self.test_db)

        if self.sess is None:
            self.build(learning_rate=learning_rate)

        self.load_image_features(load_test_features=True)

        total_batch = int(len(X_train) / batch_size)
        if logger:
            print("Starting training...")
            print("Total nr of batches:", total_batch)
        # Training cycle
        training_pair_counter = 0
        start_time = time.time()  # start of training, used by the per-epoch timing log
        for epoch in range(training_epochs):
            # Loop over all batches
            indexes = np.arange(len(X_train))
            for i in range(total_batch):
                idx = np.random.choice(indexes, int(batch_size * 2), replace=True)
                # q_idx = idx[:(len(idx) - int(len(idx) / 3))]
                # t_idx = list(idx[(len(idx) - int(len(idx) / 3)):]) + list(q_idx[int(len(idx) / 3):])
                q_idx = idx[:int(len(idx) / 2)]
                t_idx = idx[int(len(idx) / 2):]
                batch_qs = X_train[q_idx]  # Query images
                batch_ts = X_train[t_idx]  # Test images
                # Shuffling
                p = np.random.permutation(len(q_idx))
                batch_qs = batch_qs[p]
                batch_ts = batch_ts[p]
                batch_qs_f = [self.db_features[x] for x in batch_qs]
                batch_ts_f = [self.db_features[x] for x in batch_ts]
                batch_qs_f, batch_ts_f, batch_ys = calculate_score_batch(batch_qs, batch_ts, self.db,
                                                                         q_features=batch_qs_f, t_features=batch_ts_f)
                # Run optimization op (backprop) and cost op (to get loss value)
                _, c = self.sess.run([self.optimizer, self.loss_function],
                                     feed_dict={self.Q: batch_qs_f, self.T: batch_ts_f, self.Y: batch_ys,
                                                self.keep_prob: .5})

                training_pair_counter += len(batch_qs)
                if i % self.history_sampling_rate == 0:
                    self.cost_history.append(c)
                    test_idx = np.random.choice(np.arange(0, len(X_test)), batch_size)
                    test_q_idx = test_idx[:int(len(test_idx) / 2)]
                    test_t_idx = test_idx[int(len(test_idx) / 2):]
                    test_batch_qs = X_test[test_q_idx]  # Query images
                    test_batch_ts = X_test[test_t_idx]  # Test images
                    test_batch_qs_f = [self.test_db_features[x] for x in test_batch_qs]
                    test_batch_ts_f = [self.test_db_features[x] for x in test_batch_ts]
                    test_batch_qs_f, test_batch_ts_f, test_batch_ys = calculate_score_batch(test_batch_qs,
                                                                                            test_batch_ts, self.test_db,
                                                                                            q_features=test_batch_qs_f,
                                                                                            t_features=test_batch_ts_f)
                    acc = self.sess.run(self.loss_function, feed_dict={self.Q: test_batch_qs_f, self.T: test_batch_ts_f,
                                                                       self.Y: test_batch_ys, self.keep_prob: 1.})
                    self.test_acc_history.append(acc)
                    print("Batch index:", '%04d' % i, "Cost:", c, "Validation accuracy:", acc)

            # Do more precise test after each epoch
            if epoch % self.display_step == 0:
                test_idx = np.random.choice(np.arange(0, len(X_test)), 1000)
                test_q_idx = test_idx[:int(len(test_idx) / 2)]
                test_t_idx = test_idx[int(len(test_idx) / 2):]
                test_batch_qs = X_test[test_q_idx]  # Query images
                test_batch_ts = X_test[test_t_idx]  # Test images
                test_batch_qs_f = [self.test_db_features[x] for x in test_batch_qs]
                test_batch_ts_f = [self.test_db_features[x] for x in test_batch_ts]
                test_batch_qs_f, test_batch_ts_f, test_batch_ys = calculate_score_batch(test_batch_qs, test_batch_ts,
                                                                                        self.test_db,
                                                                                        q_features=test_batch_qs_f,
                                                                                        t_features=test_batch_ts_f)
                test_accuracy = self.sess.run(self.loss_function,
                                              feed_dict={self.Q: test_batch_qs_f, self.T: test_batch_ts_f,
                                                         self.Y: test_batch_ys, self.keep_prob: 1.})
                # self.test_acc_history.append(test_accuracy)
                print("Epoch:", '%03d' % (epoch + 1), "total trained pairs:", '%09d' % training_pair_counter,
                      "\ttest acc =", test_accuracy, "- time used:", time.time() - start_time)
                if save:
                    self.saver.save(self.sess, save_path + self.model_name, global_step=epoch + 1)

        # Printing out some comparisons
        test_idx = np.random.choice(np.arange(0, len(X_test)), 200)
        test_q_idx = test_idx[:int(len(test_idx) / 2)]
        test_t_idx = test_idx[int(len(test_idx) / 2):]
        test_batch_qs = X_test[test_q_idx]  # Query images
        test_batch_ts = X_test[test_t_idx]  # Test images
        test_batch_qs_f = [self.test_db_features[x] for x in test_batch_qs]
        test_batch_ts_f = [self.test_db_features[x] for x in test_batch_ts]
        test_batch_qs_f, test_batch_ts_f, test_batch_ys = calculate_score_batch(test_batch_qs, test_batch_ts,
                                                                                self.test_db,
                                                                                q_features=test_batch_qs_f,
                                                                                t_features=test_batch_ts_f)
        output = self.sess.run(self.y_pred, feed_dict={self.Q: test_batch_qs_f, self.T: test_batch_ts_f,
                                                       self.Y: test_batch_ys, self.keep_prob: 1.})
        print(test_batch_ys)
        print(output)

        if show_test_acc:
            y_axis = np.array(self.test_acc_history)
            plt.plot(y_axis)
            plt.show()

        if show_cost:
            y_axis = np.array(self.cost_history)
            plt.plot(y_axis)
            plt.show()
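
The pair sampling inside the training loop above can be illustrated in isolation: 2 * batch_size indices are drawn with replacement, split into a query half and a target half, and both halves are shuffled with the same permutation before the feature lookups. A self-contained sketch with made-up image names:

import numpy as np

batch_size = 4
X_train = np.array(['img_%02d.jpg' % i for i in range(10)])

idx = np.random.choice(np.arange(len(X_train)), batch_size * 2, replace=True)
q_idx, t_idx = idx[:batch_size], idx[batch_size:]

p = np.random.permutation(batch_size)
batch_qs, batch_ts = X_train[q_idx][p], X_train[t_idx][p]
print(list(zip(batch_qs, batch_ts)))  # query/target image pairs for one batch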