def create_flat_images_db(start=0, end=100, dim=(100, 100)):
    """Build and dump a flat-image database for images ``start``..``end - 1``.

    Each file in IMAGES_PATH (named ``<number>.jpg``) is resized to ``dim``,
    flattened into a row vector, and collected in a DataFrame indexed by the
    numeric filename stem. The sorted DataFrame is dumped to a joblib file
    whose name encodes the start/end range.

    Args:
        start: index of the first image to include (inclusive).
        end: one past the last image to include; clipped to the number of
            files actually present in IMAGES_PATH.
        dim: (width, height) each image is resized to before flattening.

    Returns:
        None. Returns early without writing anything when fewer than
        ``start`` files exist.
    """
    filenames = os.listdir(IMAGES_PATH)
    if len(filenames) < start:
        return None
    end = min(end, len(filenames))

    # os.path.splitext instead of str.strip('.jpg'): strip() removes a *set
    # of characters* from both ends, so any stem starting/ending with
    # 'j', 'p', 'g' or '.' would be silently mangled.
    def _index(name):
        return int(os.path.splitext(name)[0])

    filelist = sorted(filenames, key=_index)[start:end]

    print("Creating joblib database...")
    # Collect per-image frames and concatenate once: DataFrame.append in a
    # loop was O(n^2) and has been removed in pandas 2.0.
    frames = []
    for filename in tqdm(filelist,
                         bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        img = preprocess_image(os.path.join(IMAGES_PATH, filename), dim=dim)
        frames.append(pd.DataFrame(img, index=[_index(filename)]))
    img_db = pd.concat(frames) if frames else pd.DataFrame()

    # save df to joblib file
    path = join(dirname(__file__), '..', 'raw_data', 'test_images_db_2',
                'flat_resized_images')
    img_db.sort_index(inplace=True)
    out_path = '_'.join([path, str(start), str(end - 1)]) + '.joblib'
    joblib.dump(img_db, out_path)
    print(f"=> Created file {out_path}")
def create_flat_images_db(size=100, path=IMAGES_PATH, dim=(36, 42)):
    """Resize, flatten and store the first ``size`` images found in ``path``.

    Each image is resized to ``dim``, flattened into a row vector and
    collected in a DataFrame indexed by the filename stem (as a string).
    The sorted DataFrame is dumped to a joblib file whose name encodes
    ``size``; when ``size`` is falsy the bare root path is used.

    Args:
        size: number of images to process; ``None`` slices all files,
            ``0`` yields an empty database.
        path: directory containing the .jpg images.
        dim: (width, height) target size for each image.
    """
    print("\nCreating joblib database...")
    # Fix: honour the ``path`` parameter — the original iterated and joined
    # against IMAGES_PATH, silently ignoring the argument.
    # Collect frames and concat once: DataFrame.append was removed in
    # pandas 2.0 and was O(n^2) in a loop.
    frames = []
    for filename in tqdm(os.listdir(path)[:size],
                         bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        img = preprocess_image(os.path.join(path, filename), dim=dim)
        # splitext, not str.strip('.jpg'): strip() removes characters from
        # both ends, not a suffix, and would corrupt some stems.
        frames.append(pd.DataFrame(img,
                                   index=[os.path.splitext(filename)[0]]))
    img_db = pd.concat(frames) if frames else pd.DataFrame()

    # save df to joblib file
    img_db.sort_index(inplace=True)
    if size:
        joblib.dump(img_db,
                    FLAT_IMAGES_DB_PATH_ROOT + '_' + str(size) + '.joblib')
    else:
        joblib.dump(img_db, FLAT_IMAGES_DB_PATH_ROOT + '.joblib')
client = storage.Client() dataset_filename = 'catalog.csv' path = f"gs://{BUCKET_NAME}/{BUCKET_INITIAL_DATASET_FOLDER}/{dataset_filename}" df = pd.read_csv(path, encoding='unicode_escape') df['URL'] = df['URL'].map(get_jpg_link) urls = [df.iloc[i]['URL'] for i in indexes] if not all_info: return urls # get additional info titles = [df.iloc[i]['TITLE'] for i in indexes] authors = [df.iloc[i]['AUTHOR'] for i in indexes] return urls, titles, authors if __name__ == '__main__': from vincentvanbot.preprocessing import preprocess_image path = os.path.join(os.path.dirname(__file__), '..', '..', 'notebooks', 'example-input.jpg') # print(path) user_img = preprocess_image(path, dim=(36, 42)) indexes = get_closest_images_indexes(user_img) urls = get_info_from_index(indexes) print(urls) '''Change path to local here'''
max_results=50, proba_threshold=0.5, manual=False).keys()] '''sum each row for the resulting dataframe (=KNN Dataframe with only matching labels to the input data)''' KNN_index['SUM'] = KNN_index.sum(axis=1) KNN_index = KNN_index.sort_values(by=['SUM'], ascending=False)[:3] return list(KNN_index.index.values) if __name__ == '__main__': from vincentvanbot.preprocessing import preprocess_image path = os.path.join(os.path.dirname(__file__), '..', 'notebooks', 'example-input.jpg') #get_labels_from_local_path(path, max_results = 10 , proba_threshold=0.5, manual=False).keys() user_img = preprocess_image(path, dim=(100, 100)) print(get_info_from_index(filter_KNN_results(user_img, path))) # from vincentvanbot.data import get_data_locally # df_total = get_data_locally(100) # import numpy as np # df_list = [] # for i, df in enumerate(np.array_split(df_total,10)): # print(f"\nWorking on slice {i+1}") # labels_df = get_labels_df(df,100_000, source='local', manual=False) # df_list.append(labels_df) # import pandas as pd # labels_df_total = pd.concat(df_list,axis=0) # labels_df_total.fillna(0, inplace= True)
def process_user_file(file, n_similar=3):
    """Return the ``n_similar`` catalog paintings closest to a user image.

    The uploaded bytes are written to a temporary file, labeled, flattened,
    and matched against a pre-fitted KNN model; candidates are re-ranked by
    combining KNN order with the count of labels shared with the user image.

    Args:
        file: file-like object with a ``read()`` method (the user upload).
        n_similar: number of results to return.

    Returns:
        list of dicts with keys ``img_url``, ``html_url``, ``author``,
        ``title``, ``created``, ``museum``; empty list if processing fails.
    """
    response = []

    # Persist the upload to disk: the labeling and preprocessing helpers
    # expect a local path, not a byte stream.
    contents = file.read()
    # NOTE(review): a timestamp name can collide under concurrent requests —
    # consider tempfile.mkstemp; kept as-is to preserve behavior.
    temp_file_name = str(time.time())
    with open(temp_file_name, 'wb') as user_file:
        user_file.write(contents)

    try:
        # Label the user pic, keeping only labels known to the dataset.
        labels_dict = get_labels_from_local_path(temp_file_name,
                                                 50,
                                                 0.5,
                                                 manual=False)
        labels_list = [
            label for label in labels_dict.keys()
            if label in labeled_dataframe.columns
        ]

        # Flatten/resize into the vector shape the KNN model was fit on.
        image_processed = preprocess_image(temp_file_name)

        # Fetch 2*n_similar neighbours so label re-ranking has candidates
        # to discard; kneighbors returns (distances, indices).
        index_neighbors = knn_model.kneighbors(
            image_processed, n_neighbors=n_similar * 2)[1][0]
        base_indices = [int(train_indices[i]) for i in list(index_neighbors)]

        # Rank 1..k by KNN distance order. .copy() avoids mutating a view of
        # labeled_dataframe (pandas SettingWithCopy hazard).
        labeled_knn_filtered = labeled_dataframe.loc[base_indices, :].copy()
        labeled_knn_filtered['KNN_RANK'] = labeled_knn_filtered.reset_index(
        ).index.values + 1
        indices_with_knn_rank = labeled_knn_filtered[['KNN_RANK']]

        # Rank 1..k by how many labels each candidate shares with the input.
        labeled_knn_filtered = labeled_knn_filtered[labels_list].copy()
        labeled_knn_filtered['SUM'] = labeled_knn_filtered.sum(axis=1)
        labeled_knn_filtered = labeled_knn_filtered.sort_values(
            by=['SUM'], ascending=False)
        labeled_knn_filtered['LABELS_RANK'] = labeled_knn_filtered.reset_index(
        ).index.values + 1
        indices_with_labels_rank = labeled_knn_filtered[['LABELS_RANK']]

        # Combine both ranks; lowest sum wins, keep the top n_similar.
        knn_with_labels = indices_with_knn_rank.join(indices_with_labels_rank,
                                                     how='inner')
        knn_with_labels['RANK_SUM'] = knn_with_labels[
            'KNN_RANK'] + knn_with_labels['LABELS_RANK']
        knn_with_labels = knn_with_labels.sort_values(by=['RANK_SUM'],
                                                      ascending=True)
        results = list(knn_with_labels.index.values)[:n_similar]

        # Send response back. Response structure:
        # [n_similar pieces of {img_url, html_url, author, title, created,
        # museum}]
        for ind in results:
            response_item = dict(
                img_url=get_jpg_link(database.at[ind, 'URL']),
                html_url=database.at[ind, 'URL'],
                author=database.at[ind, 'AUTHOR'],
                title=database.at[ind, 'TITLE'],
                created=database.at[ind, 'DATE'],
                museum=database.at[ind, 'LOCATION'])
            response.append(response_item)
    except Exception as e:
        # Fix: was ``except BaseException``, which also swallows
        # KeyboardInterrupt/SystemExit. Best-effort endpoint: log and
        # return an empty response.
        print(e)
    finally:
        # Always delete the temporary copy of the user picture.
        os.remove(temp_file_name)

    return response