Example #1
    def predict_with_model(path, name, cmodel, get_data_and_query):
        print(f'predicting with {name}')

        create_dir(f'{path}/{name}')

        with open(f'{path}/{name}/label_query_vector',
                  'wb') as label_query_vector:
            reader = read_from_pickle(f'{path}/processed')
            for coll in iter_by_batch(reader, batch_size):
                # 'folder' and 'batch_size' come from the enclosing function (see Example #10)
                print(f'iterating ------ {folder}')
                unzipped = list(zip(*coll))
                data, queries = get_data_and_query(unzipped)
                for i in zip(cmodel.predict(data), queries, data):
                    pickle.dump(i, label_query_vector)

        create_dir(f'{path}/cluster_{name}')
        create_dir(f'{path}/cluster_{name}_dump')

        for label, query, vector in read_from_pickle(
                f'{path}/{name}/label_query_vector'):
            with open(f'{path}/cluster_{name}/{label}', 'a') as f:
                f.write(f'{query}\n')

            with open(f'{path}/cluster_{name}_dump/{label}', 'ab') as fh:
                pickle.dump(vector, fh)

        coll = list(read_from_pickle(f'{path}/{name}/label_query_vector'))
        unzipped = list(zip(*coll))
        labels = unzipped[0]
        vectors = unzipped[2]

        with open(f'{path}/{name}/silhouette', 'w') as f:
            f.write(str(silhouette_score(vectors, labels)))
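
In this project read_from_pickle(...) is iterated over, and the file it reads is built by calling pickle.dump repeatedly on a single open handle, so the helper is presumably a generator that yields each pickled object in turn. Its definition is not part of these examples; the following is only a minimal sketch of that assumed append-then-iterate convention:

import pickle

def read_from_pickle(path):
    # Yield every object that was appended to `path` via repeated pickle.dump calls.
    with open(path, 'rb') as fh:
        while True:
            try:
                yield pickle.load(fh)
            except EOFError:
                return
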
Example #2
def execute_task5(request):
    l = int(request.POST.get('number_of_layers'))
    k = int(request.POST.get('number_of_hashes_per_layer'))
    lsh = LSH(k=k, l=l)
    dbconnection = DatabaseConnection()

    all_image_hog_features = read_from_pickle('all_img_features_LSH.pickle')
    if all_image_hog_features is None:
        all_image_hog_features = dbconnection.get_object_feature_matrix_from_db(tablename='histogram_of_gradients')
        save_to_pickle(all_image_hog_features, 'all_img_features_LSH.pickle')

    # SVD on HOG features
    svd_obj = read_from_pickle('svd_hog_lsh.pickle')
    if svd_obj is not None:
        transformed_data = svd_obj['data_matrix']
        vt = svd_obj['vt']
    else:
        svd = SingularValueDecomposition()
        transformed_data, vt = svd.get_transformed_data_copy(all_image_hog_features['data_matrix'], 400)
        save_to_pickle({'data_matrix': transformed_data, 'images': all_image_hog_features['images'], 'vt': vt},
                       'svd_hog_lsh.pickle')

    # index_of_query_image = (all_image_hog_features['images']).index(query_image)
    # image_vector = transformed_data[index_of_query_image]
    bit_map = lsh.generate_representation_for_all_layers(transformed_data,all_image_hog_features['images'])

    save_to_pickle(lsh, 'lsh_model')
    return render(request, 'task5a_output.html')
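
execute_task5 treats read_from_pickle as a per-file cache that yields None when nothing has been saved yet, which is a different contract from the streaming generator assumed in Example #1. That contract is an assumption (the helper is not defined in these examples); a minimal sketch of such a save_to_pickle / read_from_pickle pair could look like this:

import os
import pickle

def save_to_pickle(obj, filename):
    # Cache one object per file so the next request can skip the expensive recomputation.
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh)

def read_from_pickle(filename):
    # Return the cached object, or None when the cache file does not exist yet.
    if not os.path.exists(filename):
        return None
    with open(filename, 'rb') as fh:
        return pickle.load(fh)
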
Example #3
def collect_global_stats():
    path = '../data/dates/'
    files = [
        map(lambda x: x[2], read_from_pickle(f'{path}{folder}/processed'))
        for folder in os.listdir(path)
    ]

    files_n = [
        map(lambda x: x[3], read_from_pickle(f'{path}{folder}/processed'))
        for folder in os.listdir(path)
    ]

    hoq = HistogramOfQueries('../data/global_stats/hoq')
    hot = HistogramOfTokens('../data/global_stats/hot')
    for query in tqdm(itertools.chain(*files)):
        hoq.add_doc(query)
        hot.add_doc(query)
    hoq.save()
    hot.save()

    hoq = HistogramOfQueries('../data/global_stats/hoq_n')
    hot = HistogramOfTokens('../data/global_stats/hot_n')
    for query in tqdm(itertools.chain(*files_n)):
        hoq.add_doc(query)
        hot.add_doc(query)
    hoq.save()
    hot.save()
Example #4
    def load_data(self):
        print('loading {}-{} features'.format(self.dataset_name, self.cnn_name))
        self.train_data_ids = utils.read_file_to_list(self.train_data_ids_path)
        self.val_data_ids = utils.read_file_to_list(self.val_data_ids_path)
        self.test_data_ids = utils.read_file_to_list(self.test_data_ids_path)
        utils.shuffle_array(self.train_data_ids)
        utils.shuffle_array(self.val_data_ids)
        utils.shuffle_array(self.test_data_ids)
        self.train_data_ids = self.train_data_ids[:1]   # ONLY FOR DEBUG - REMOVE
        self.val_data_ids = self.val_data_ids[:1]
        self.test_data_ids = self.test_data_ids[:1]
        self.train_caps = utils.read_from_json(self.train_caps_path)
        self.val_caps = utils.read_from_json(self.val_caps_path)
        self.test_caps = utils.read_from_json(self.test_caps_path)
        self.vocab = utils.read_from_json(self.vocab_path)
        self.reverse_vocab = utils.read_from_pickle(self.reverse_vocab_path)
        self.vocab_size = len(self.vocab)
        if self.cnn_name in ['ResNet50', 'ResNet152', 'InceptionV3']:
            self.ctx_dim = 2048
        elif self.cnn_name in ['MURALI']:
            self.ctx_dim = 1024
        elif self.cnn_name in ['VGG19']:
            self.ctx_dim = 512
        else:
            raise NotImplementedError()
        self.train_ids = self.get_vid_ids(self.train_data_ids)
        self.val_ids = self.get_vid_ids(self.val_data_ids)
        self.test_ids = self.get_vid_ids(self.test_data_ids)
        self.kf_train = utils.generate_minibatch_idx(len(self.train_data_ids), self.mb_size_train)
        self.kf_val = utils.generate_minibatch_idx(len(self.val_data_ids), self.mb_size_test)   # TODO - verify test or val
        self.kf_test = utils.generate_minibatch_idx(len(self.test_data_ids), self.mb_size_test)
Example #5
    def get_DTC_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        X_train, Y_train = self.create_X_Y_as_np_matrix(rel_items=rel_items, irl_items=irl_items)

        # Training decision tree classifier
        dtl = decision_tree_learning.DecisionTreeLearning()
        dtl.fit(X=X_train, y=Y_train)

        # Now getting more test data from LSH indexes
        test_dataset = read_from_pickle('test_dataset.pickle')
        X_test, imageNames = self.create_X_test_as_np_matrix(test_dataset=test_dataset)
        Y_pred = dtl.predict(u=X_test)
        relevant_pred_img_names = [imageNames[i] for i in range(0, len(Y_pred)) if Y_pred[i] == 1]

        length_relevant_images = len(relevant_pred_img_names)
        if length_relevant_images < m:
            # Pad with images predicted irrelevant until m images are available
            irr_image_names = [imageNames[i] for i in range(len(Y_pred)) if Y_pred[i] == -1]
            relevant_pred_img_names.extend(irr_image_names[:m - length_relevant_images])

        new_obj_feature_matrix = self.database_connection.HOG_descriptor_from_image_ids(
            image_ids=relevant_pred_img_names)

        new_rank_list = get_most_m_similar_images(data_with_images=new_obj_feature_matrix,
                                                  query_image_feature_vector=q_new, m=m)
        return new_rank_list
Example #6
    def train_model(path, cmodel, get_data_and_query):
        print(f'training model {path}')
        reader = read_from_pickle(f'{path}/processed')
        for coll in iter_by_batch(reader, batch_size):
            unzipped = list(zip(*coll))
            data, _ = get_data_and_query(unzipped)

            online_clustering(data, cmodel)
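
train_model feeds the generator returned by read_from_pickle through iter_by_batch; batch_size is defined in the enclosing process_folders (see Example #10). iter_by_batch itself is not shown in these examples, so the following is only a minimal sketch assuming it groups an iterator into lists of at most batch_size items:

from itertools import islice

def iter_by_batch(iterable, batch_size):
    # Group any iterable into successive lists of at most batch_size items.
    iterator = iter(iterable)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            return
        yield batch
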
Example #7
def get_hsv_std_values():
    """
        Import and return stored HSV value object from '/pickle_files/hsv.pickle'. If no such file exists, return HSV
        object with default values.

        :return:
    """
    hsv_std_values = utils.read_from_pickle(HSV_PICKLE_PATH)
    if not hsv_std_values:
        return HSV()
    else:
        return hsv_std_values
Example #8
    def get_PPR_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        topology_images = read_from_pickle('test_dataset.pickle')
        image_names = get_image_names_from_tuples(topology_images)
        db_conn = DatabaseConnection()
        data_image_dict = db_conn.HOG_descriptor_from_image_ids(image_names)
        data_matrix = data_image_dict['data_matrix']
        image_names = data_image_dict['images']
        svd_obj = SingularValueDecomposition()
        svd_image_data = svd_obj.get_transformed_data(data_matrix, 8)  # change this for 11K images

        pg_obj = PageRank()
        image_similarity_matrix = pg_obj.get_image_similarity_matrix_for_top_k_images(6, svd_image_data)
        seed_vector = pg_obj.get_seed_vector(rel_items, image_names, irl_items)
        pie = pg_obj.get_page_rank_eigen_vector(image_similarity_matrix, seed_vector)
        new_rank_list = pg_obj.get_top_K_images_based_on_scores(pie, image_names, m)

        return new_rank_list
Example #9
    def __init__(self, cur_dir, dataset_path, executable_path):
        """
        Args:
            cur_dir: Working directory (for lkh3 files)
            dataset_path: Path to graph data
            executable_path: Path to LKH-3 executable (LKH file)
        """

        print('This class was written and tested for Unix systems only')

        self.platform = sys.platform

        self.dir = cur_dir

        print('Creating directory ', self.dir)
        os.makedirs(self.dir, exist_ok=True)

        print('Loading validation dataset ', dataset_path)
        self.val_data = read_from_pickle(dataset_path,
                                         return_tf_data_set=False)

        self.problem_files = []

        self.tour_files = []

        self.params_files = []

        self.executable = executable_path

        self.depot_list = []
        self.loc_list = []
        self.demands_list = []
        self.tour_list = []

        # Params for LKH-3
        self.runs = 1
        self.seed = 1234
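
In this project read_from_pickle takes a return_tf_data_set keyword, so it apparently loads a pickled dataset and can optionally wrap it for TensorFlow. Its real behaviour is not shown in these examples; a minimal sketch under that assumption (the tf.data conversion is purely illustrative):

import pickle
import tensorflow as tf

def read_from_pickle(path, return_tf_data_set=True):
    # Load the pickled validation instances from disk.
    with open(path, 'rb') as fh:
        data = pickle.load(fh)
    if return_tf_data_set:
        # Optionally expose the samples as a tf.data.Dataset instead of plain Python objects.
        return tf.data.Dataset.from_tensor_slices(data)
    return data
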
Example #10
def process_folders(path, folders, tfidf_dict=None):
    batch_size = 100

    def train_model(path, cmodel, get_data_and_query):
        print(f'training model {path}')
        reader = read_from_pickle(f'{path}/processed')
        for coll in iter_by_batch(reader, batch_size):
            unzipped = list(zip(*coll))
            data, _ = get_data_and_query(unzipped)

            online_clustering(data, cmodel)

    def predict_with_model(path, name, cmodel, get_data_and_query):
        print(f'predicting with {name}')

        create_dir(f'{path}/{name}')

        with open(f'{path}/{name}/label_query_vector',
                  'wb') as label_query_vector:
            reader = read_from_pickle(f'{path}/processed')
            for coll in iter_by_batch(reader, batch_size):
                print(f'iterating ------ {folder}')
                unzipped = list(zip(*coll))
                data, queries = get_data_and_query(unzipped)
                for i in zip(cmodel.predict(data), queries, data):
                    pickle.dump(i, label_query_vector)

        create_dir(f'{path}/cluster_{name}')
        create_dir(f'{path}/cluster_{name}_dump')

        for label, query, vector in read_from_pickle(
                f'{path}/{name}/label_query_vector'):
            with open(f'{path}/cluster_{name}/{label}', 'a') as f:
                f.write(f'{query}\n')

            with open(f'{path}/cluster_{name}_dump/{label}', 'ab') as fh:
                pickle.dump(vector, fh)

        coll = list(read_from_pickle(f'{path}/{name}/label_query_vector'))
        unzipped = list(zip(*coll))
        labels = unzipped[0]
        vectors = unzipped[2]

        with open(f'{path}/{name}/silhouette', 'w') as f:
            f.write(str(silhouette_score(vectors, labels)))

    for folder in folders:
        cmodels = [('w2v', Birch(n_clusters=300), lambda uz:
                    (np.array(uz[0]), uz[2])),
                   ('w2v_n', Birch(n_clusters=300), lambda uz:
                    (np.array(uz[1]), uz[3])),
                   ('tfidf', Birch(n_clusters=300), lambda uz:
                    (get_tfidf_rep(uz[2], tfidf_dict), uz[2])),
                   ('tfidf_n', Birch(n_clusters=300), lambda uz:
                    (get_tfidf_rep(uz[3], tfidf_dict), uz[3]))]

        labels = []

        for name, cmodel, get_data in cmodels:
            train_model(f'{path}{folder}', cmodel, get_data)
            predict_with_model(f'{path}{folder}', name, cmodel, get_data)

            labels.append([
                label for label, _, _ in read_from_pickle(
                    f'{path}{folder}/{name}/label_query_vector')
            ])

        # read labels, then compare every pair of models
        with open(f'{path}{folder}/cluster_similarity', 'w') as sim_file:
            for i in range(len(cmodels)):
                for j in range(i + 1, len(cmodels)):
                    name_i, name_j = cmodels[i][0], cmodels[j][0]
                    score = adjusted_rand_score(labels[i], labels[j])
                    sim_file.write(f'{name_i}/{name_j} {score}\n')
Example #11
def execute_task6(request):
    query_image = request.POST.get('query_image')
    most_similar_images = int(request.POST.get('most_similar_images'))
    query_image_folder_name = request.POST.get('query_image_folder_name')
    relevance_feedback = request.POST.get('relevance_feedback')
    lsh = read_from_pickle('lsh_model')
    db_connection = DatabaseConnection()
    image_vector = db_connection.get_feature_data_for_image(
        'histogram_of_gradients', query_image)
    image_vector = np.asarray(image_vector.flatten())

    all_image_hog_features = read_from_pickle('all_img_features_LSH.pickle')
    if all_image_hog_features is None:
        all_image_hog_features = db_connection.get_object_feature_matrix_from_db(
            tablename='histogram_of_gradients')
        save_to_pickle(all_image_hog_features, 'all_img_features_LSH.pickle')
    # SVD on HOG features
    svd_obj = read_from_pickle('svd_hog_lsh.pickle')
    if svd_obj is not None:
        transformed_data = svd_obj['data_matrix']
        vt = svd_obj['vt']
    else:
        svd = SingularValueDecomposition()
        transformed_data, vt = svd.get_transformed_data_copy(
            all_image_hog_features['data_matrix'], 400)
        save_to_pickle(
            {
                "data_matrix": transformed_data,
                "images": all_image_hog_features['images'],
                "vt": vt
            }, 'svd_hog_lsh.pickle')

    if query_image_folder_name != '':
        table_name = convert_folder_path_to_table_name(
            query_image_folder_name, 'histogram_of_gradients')
        image_vector = db_connection.get_feature_data_for_image(
            table_name, query_image)

    image_vector = np.dot(image_vector.astype(float), np.transpose(vt))

    new_obj = {
        'data_matrix': transformed_data,
        'images': all_image_hog_features['images']
    }
    (sorted_k_values,
     result_stats) = lsh.find_ksimilar_images(k=most_similar_images,
                                              image_vector=image_vector,
                                              all_image_hog_features=new_obj)

    # Now getting a bigger test dataset for relevance feedback
    if relevance_feedback == "Probabilistic":
        (test_dataset, result_stats) = lsh.find_ksimilar_images(
            k=10 + most_similar_images,
            image_vector=image_vector,
            all_image_hog_features=new_obj)
    else:
        (test_dataset, result_stats) = lsh.find_ksimilar_images(
            k=200 + most_similar_images,
            image_vector=image_vector,
            all_image_hog_features=new_obj)

    save_to_pickle(test_dataset, 'test_dataset.pickle')
    print(sorted_k_values[:most_similar_images])
    return render(
        request, 'visualize_images.html', {
            'images': sorted_k_values[:most_similar_images],
            "from_task": "task5",
            'rel_type': relevance_feedback,
            "q": query_image,
            "t": most_similar_images,
            "num_total": result_stats['total'],
            "num_unique": result_stats['unique']
        })
Example #12
def process_feedback(request):
    rf = RelevanceFeedback()
    relevant = request.POST.get("relevant[]")
    irrelevant = request.POST.get("irrelevant[]")
    rel_type = json.loads(request.POST.get("rel_type"))
    m = int(request.POST.get("t"))

    q_name = json.loads(request.POST.get("q"))
    # obj_feature_matrix = rf.database_connection.get_object_feature_matrix_from_db('histogram_of_gradients')
    obj_similar_thousand_names = read_from_pickle('test_dataset.pickle')
    obj_similar_thousand_names = [x[0] for x in obj_similar_thousand_names]
    obj_feature_matrix = rf.database_connection.HOG_descriptor_from_image_ids(
        image_ids=obj_similar_thousand_names)
    data_matrix = obj_feature_matrix['data_matrix']
    new_rank_list = []
    relevant = json.loads(relevant)
    irrelevant = json.loads(irrelevant)
    q = rf.database_connection.get_feature_data_for_image(
        'histogram_of_gradients', q_name)
    # Vt=rf.get_Vt(obj_feature_matrix=obj_feature_matrix)

    if rel_type == 'Probabilistic':
        n_i = rf.calculate_n_i(D_matrix=data_matrix)
        new_rank_list = rf.calculate_feedback_prob_similarity(
            D_matrix=data_matrix,
            images=obj_feature_matrix['images'],
            relevant_items=relevant,
            n_i=n_i)

        new_rank_list = new_rank_list[:m]

    elif rel_type == 'Support Vector Machine':
        new_rank_list = rf.get_SVM_based_feedback(
            q=q,
            rel_items=relevant,
            irl_items=irrelevant,
            obj_feature_matrix=obj_feature_matrix,
            m=m)
        # new_rank_list=rf.get_SVM_based_feedback(q=q,Vt=Vt,rel_items=relevant,irl_items=irrelevant,obj_feature_matrix=obj_feature_matrix,m=m)

    elif rel_type == 'Decision Tree Classifier':
        new_rank_list = rf.get_DTC_based_feedback(
            q=q,
            rel_items=relevant,
            irl_items=irrelevant,
            obj_feature_matrix=obj_feature_matrix,
            m=m)

    elif rel_type == 'Personalized Page Rank':
        new_rank_list = rf.get_PPR_based_feedback(
            q=q,
            rel_items=relevant,
            irl_items=irrelevant,
            obj_feature_matrix=obj_feature_matrix,
            m=m)
    else:
        new_rank_list.append((
            'Please select a relevance feedback type and start again from task 5',
            '0'))

    return render(
        request, 'visualize_images.html', {
            'images': new_rank_list,
            "from_task": "task6",
            "rel_type": rel_type,
            "q": q_name,
            "t": m
        })