Example #1
def create_flat_images_db(start=0, end=100, dim=(100, 100)):
    """For each image in path, resizes it to the given dim, transforms it into a flat vector
    and stores in a df. Then dumps it into a joblib file.
    Start and end define which images to consider."""

    if len(os.listdir(IMAGES_PATH)) < start:
        return None
    elif len(os.listdir(IMAGES_PATH)) < end:
        end = len(os.listdir(IMAGES_PATH))
    # sort numerically by the filename stem (splitext is safer than strip('.jpg'))
    filelist = sorted(os.listdir(IMAGES_PATH),
                      key=lambda x: int(os.path.splitext(x)[0]))[start:end]

    # stores flat images in a dataframe
    img_db = pd.DataFrame()
    print("Creating joblib database...")
    for filename in tqdm(filelist,
                         bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        img = preprocess_image(os.path.join(IMAGES_PATH, filename), dim=dim)
        # DataFrame.append was removed in pandas 2.0; build the db with pd.concat
        img_db = pd.concat([
            img_db,
            pd.DataFrame(img, index=[int(os.path.splitext(filename)[0])])
        ])

    # save df to joblib file
    path = join(dirname(__file__), '..', 'raw_data', 'test_images_db_2',
                'flat_resized_images')
    img_db.sort_index(inplace=True)
    joblib.dump(img_db, '_'.join([path, str(start), str(end - 1)]) + '.joblib')
    print(
        f"=> Created file {'_'.join([path,str(start),str(end-1)])+'.joblib'}")
Example #2
def create_flat_images_db(size=100, path=IMAGES_PATH, dim=(36, 42)):
    """For each image in path, resizes it to the given dim, transforms it into a
    flat vector and stores it in a DataFrame, which is then dumped to a joblib file."""

    # stores flat images in a dataframe
    img_db = pd.DataFrame()
    print("\nCreating joblib database...")
    for filename in tqdm(os.listdir(path)[:size],
                         bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        img = preprocess_image(os.path.join(path, filename), dim=dim)
        # DataFrame.append was removed in pandas 2.0; build the db with pd.concat
        img_db = pd.concat(
            [img_db, pd.DataFrame(img, index=[os.path.splitext(filename)[0]])])

    # save df to joblib file
    img_db.sort_index(inplace=True)
    if size:
        joblib.dump(img_db, FLAT_IMAGES_DB_PATH_ROOT + '_' + str(size) + '.joblib')
    else:
        joblib.dump(img_db, FLAT_IMAGES_DB_PATH_ROOT + '.joblib')
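Both variants delegate the resizing and flattening to preprocess_image, which is defined elsewhere in the project. A rough sketch of what such a helper could look like, using OpenCV (the real implementation may differ):

import cv2

def preprocess_image(img_path, dim=(100, 100)):
    """Load an image, resize it to dim and return it as a single flat row,
    ready to be wrapped in a one-row DataFrame as in the loops above."""
    img = cv2.imread(img_path)   # array of shape (height, width, 3)
    img = cv2.resize(img, dim)   # cv2 expects dim as (width, height)
    return img.reshape(1, -1)    # flatten to shape (1, dim[0] * dim[1] * 3)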
Example #3
    client = storage.Client()

    dataset_filename = 'catalog.csv'
    path = f"gs://{BUCKET_NAME}/{BUCKET_INITIAL_DATASET_FOLDER}/{dataset_filename}"

    df = pd.read_csv(path, encoding='unicode_escape')
    df['URL'] = df['URL'].map(get_jpg_link)

    urls = [df.iloc[i]['URL'] for i in indexes]

    if not all_info:
        return urls

    # get additional info
    titles = [df.iloc[i]['TITLE'] for i in indexes]
    authors = [df.iloc[i]['AUTHOR'] for i in indexes]

    return urls, titles, authors


if __name__ == '__main__':
    from vincentvanbot.preprocessing import preprocess_image
    path = os.path.join(os.path.dirname(__file__), '..', '..', 'notebooks',
                        'example-input.jpg')
    # print(path)
    user_img = preprocess_image(path, dim=(36, 42))
    indexes = get_closest_images_indexes(user_img)
    urls = get_info_from_index(indexes)
    print(urls)
'''Change path to local here'''
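pd.read_csv can read the gs:// path directly when gcsfs is installed. get_jpg_link is defined elsewhere in the project; a rough sketch of such a conversion, assuming the Web Gallery of Art layout where detail pages under /html/ map to image files under /art/:

def get_jpg_link(html_url):
    # hypothetical sketch: swap the /html/ detail-page path for the /art/ image
    # path, e.g. '.../html/p/painter/work.html' -> '.../art/p/painter/work.jpg'
    return html_url.replace('/html/', '/art/').replace('.html', '.jpg')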
Example #4
                                                     max_results=50,
                                                     proba_threshold=0.5,
                                                     manual=False).keys()]
    # sum each row of the resulting dataframe (= KNN dataframe restricted to the
    # labels matching the input image) and keep the 3 best matches
    KNN_index['SUM'] = KNN_index.sum(axis=1)
    KNN_index = KNN_index.sort_values(by=['SUM'], ascending=False)[:3]

    return list(KNN_index.index.values)


if __name__ == '__main__':
    from vincentvanbot.preprocessing import preprocess_image
    path = os.path.join(os.path.dirname(__file__), '..', 'notebooks',
                        'example-input.jpg')
    #get_labels_from_local_path(path, max_results = 10 , proba_threshold=0.5, manual=False).keys()
    user_img = preprocess_image(path, dim=(100, 100))
    print(get_info_from_index(filter_KNN_results(user_img, path)))

    # from vincentvanbot.data import get_data_locally
    # df_total = get_data_locally(100)

    # import numpy as np
    # df_list = []
    # for i, df in enumerate(np.array_split(df_total,10)):
    #     print(f"\nWorking on slice {i+1}")
    #     labels_df = get_labels_df(df,100_000, source='local', manual=False)
    #     df_list.append(labels_df)

    # import pandas as pd
    # labels_df_total = pd.concat(df_list,axis=0)
    # labels_df_total.fillna(0, inplace= True)
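The core of the filtering step above is a label-overlap ranking: restrict the KNN neighbours' one-hot label matrix to the labels detected on the user image, sum per row and keep the best matches. A toy sketch with hypothetical data:

import pandas as pd

# toy one-hot label matrix: rows are KNN neighbours (hypothetical catalogue
# indices), columns are labels
labeled_knn = pd.DataFrame(
    {'portrait': [1, 0, 0], 'landscape': [0, 1, 1], 'dog': [1, 1, 0]},
    index=[101, 205, 333])
user_labels = ['portrait', 'dog']   # e.g. labels detected on the user image

ranked = labeled_knn[user_labels].copy()
ranked['SUM'] = ranked.sum(axis=1)
ranked = ranked.sort_values(by=['SUM'], ascending=False)[:3]
print(list(ranked.index.values))    # [101, 205, 333], ordered by label overlap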
Example #5
def process_user_file(file, n_similar=3):

    response = []

    # Getting a file from a user
    contents = file.read()
    temp_file_name = str(time.time())
    with open(temp_file_name, 'wb') as user_file:
        user_file.write(contents)

    try:
        # Labeling a user pic
        labels_dict = get_labels_from_local_path(temp_file_name,
                                                 50,
                                                 0.5,
                                                 manual=False)
        labels_list = [
            label for label in labels_dict.keys()
            if label in labeled_dataframe.columns
        ]

        # The saved user file is passed to the preprocessor, which returns a flat
        # vector whose size matches the KNN model's training data.
        image_processed = preprocess_image(temp_file_name)

        # knn_model returns the indices of the 2*n_similar closest neighbours
        index_neighbors = knn_model.kneighbors(image_processed,
                                               n_neighbors=n_similar * 2)[1][0]
        base_indices = [int(train_indices[i]) for i in list(index_neighbors)]

        # Getting the knn rank
        labeled_knn_filtered = labeled_dataframe.loc[base_indices, :]
        labeled_knn_filtered['KNN_RANK'] = labeled_knn_filtered.reset_index(
        ).index.values + 1
        indices_with_knn_rank = labeled_knn_filtered[['KNN_RANK']]

        # Getting the labels rank
        labeled_knn_filtered = labeled_knn_filtered[labels_list]
        labeled_knn_filtered['SUM'] = labeled_knn_filtered.sum(axis=1)
        labeled_knn_filtered = labeled_knn_filtered.sort_values(
            by=['SUM'], ascending=False)
        labeled_knn_filtered['LABELS_RANK'] = labeled_knn_filtered.reset_index(
        ).index.values + 1
        indices_with_labels_rank = labeled_knn_filtered[['LABELS_RANK']]

        # Calculating the combined rank
        knn_with_labels = indices_with_knn_rank.join(indices_with_labels_rank,
                                                     how='inner')
        knn_with_labels['RANK_SUM'] = knn_with_labels[
            'KNN_RANK'] + knn_with_labels['LABELS_RANK']
        knn_with_labels = knn_with_labels.sort_values(by=['RANK_SUM'],
                                                      ascending=True)

        results = list(knn_with_labels.index.values)[:n_similar]

        # Build the response: a list of n_similar items, each a dict with
        # {img_url, html_url, author, title, created, museum}
        for ind in results:
            response_item = dict(img_url=get_jpg_link(database.at[ind, 'URL']),
                                 html_url=database.at[ind, 'URL'],
                                 author=database.at[ind, 'AUTHOR'],
                                 title=database.at[ind, 'TITLE'],
                                 created=database.at[ind, 'DATE'],
                                 museum=database.at[ind, 'LOCATION'])

            response.append(response_item)

    except Exception as e:  # don't swallow KeyboardInterrupt/SystemExit
        print(e)
    finally:
        # Deleting user pic
        os.remove(temp_file_name)

    return response
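In the project this function is presumably called from a web endpoint with an uploaded file; any object exposing a read() method that returns bytes will do. A minimal sketch for local testing, assuming the module-level objects (knn_model, labeled_dataframe, train_indices, database) are already loaded:

if __name__ == '__main__':
    # hypothetical local test image
    with open('example-input.jpg', 'rb') as f:
        matches = process_user_file(f, n_similar=3)
    for match in matches:
        print(match['title'], '-', match['author'], '->', match['img_url'])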