We use this model to generate **embeddings** for our images. As you can see below, once we've used the model to
generate image features, we can then **store them to disk** and re-use them without needing to do inference again!
This is one of the reasons that embeddings are so popular in practical applications, as they allow for huge
efficiency gains.
""")
with st.echo():
    model = vector_search.load_headless_pretrained_model()
    if generate_image_features:
        print("Generating image features...")
        images_features, file_index = vector_search.generate_features(image_paths, model)
        vector_search.save_features(features_path, images_features, file_mapping_path, file_index)
    else:
        images_features, file_index = vector_search.load_features(features_path, file_mapping_path)

st.write("Our model is simply VGG16 without the last layer (softmax)")
st.image(Image.open('assets/vgg16_architecture.jpg'), width=800,
         caption="Original VGG. Credit to Data Wow Blog")
st.image(Image.open('assets/vgg16_chopped.jpg'), width=800, caption="Our model")
st.write("This is how we get such a model in practice")
show_source(vector_search.load_headless_pretrained_model)

st.write("""
What do we mean by generating embeddings? Well, we just use our pre-trained model up to the penultimate layer, and
store the value of the activations.""")
show_source(vector_search.generate_features)
st.write('Here is what the embeddings look like for the first 20 images. Each image is now represented by a '
         'sparse vector of size 4096:')
st.write(images_features[:20])
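
# For reference, a minimal sketch of how such a "headless" model can be built.
# Assumptions (not from the repo, whose actual implementation is what
# show_source displays above): tensorflow.keras and the standard Keras VGG16
# layer names, where 'fc2' is the 4096-dimensional penultimate layer.
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model

def build_headless_vgg16_sketch():
    # Load VGG16 with ImageNet weights, then re-wire the model so its output
    # is the penultimate fully-connected layer instead of the softmax
    # predictions.
    pretrained = VGG16(weights='imagenet', include_top=True)
    return Model(inputs=pretrained.input,
                 outputs=pretrained.get_layer('fc2').output)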
# Error in save_features for file_mapping, so brute-force save instead
np.save('./object_detection/feature_extraction/ref_img_features_no_aug.npy',
        features)
with open('./object_detection/feature_extraction/ref_img_filemapping_no_aug.json',
          'w') as f:
    json.dump(file_mapping, f)

# %% Build database reference features
'''
Build and save an Annoy search index for fast searching.
'''
# Load the saved features and file mapping, and compile the Annoy search index
features, file_index = vector_search.load_features(
    'object_detection/feature_extraction/ref_img_features_no_aug',
    './object_detection/feature_extraction/ref_img_filemapping_no_aug')
ref_img_index = vector_search.index_features(features, n_trees=8000)
ref_img_index.save(
    './object_detection/feature_extraction/ref_img_index_no_aug.ann')

# %%
'''
Get and save a hash dictionary mapping between the file_mapping index and each
item's name, category, source website, local source file path, and local
source file name.
'''
with open(
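
# A minimal sketch (not from this repo) of loading the saved index back and
# querying it. Assumptions: 4096-dimensional VGG16 features and Annoy's
# 'angular' metric; check vector_search.index_features for the actual settings.
from annoy import AnnoyIndex

EMBEDDING_DIM = 4096  # assumed feature dimensionality

ref_index = AnnoyIndex(EMBEDDING_DIM, 'angular')
ref_index.load('./object_detection/feature_extraction/ref_img_index_no_aug.ann')

# The ten nearest reference images to item 0, with their distances
neighbor_ids, distances = ref_index.get_nns_by_item(0, 10,
                                                    include_distances=True)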
indexing, pure_image_embedding = check_inputs(index_folder, input_image,
                                              input_word, model_path,
                                              glove_path)

# Decide whether to use pre-trained VGG or a custom model
loaded_model = vector_search.load_headless_pretrained_model()
if model_path:
    loaded_model = load_model(model_path)

# Decide whether to index or search
if indexing:
    features, index = index_images(index_folder, features_path, file_mapping,
                                   loaded_model)
    print("Indexed %s images" % len(features))
else:
    images_features, file_index = vector_search.load_features(features_path,
                                                              file_mapping)
    # Decide whether to do image-only search or hybrid search
    if pure_image_embedding:
        image_index = vector_search.index_features(images_features, dims=4096)
        search_key = get_index(input_image, file_index)
        results = vector_search.search_index_by_key(search_key, image_index,
                                                    file_index)
        print(results)
    else:
        word_vectors = vector_search.load_glove_vectors(glove_path)
        # If we are searching for tags for an image
        if input_image:
            search_key = get_index(input_image, file_index)
            word_index, word_mapping = vector_search.build_word_index(
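
# A minimal sketch (assumptions, not this script's code) of what loading GloVe
# vectors for the hybrid-search branch typically involves: each line of the
# plain-text GloVe file is a word followed by its vector components.
import numpy as np

def load_glove_vectors_sketch(glove_path):
    word_vectors = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word_vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return word_vectors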
def run(item, city, thedir, site, input_file=False,
        outdir='myflask/static/matches/', first=False, features_only=False,
        sold=False, model=False, pretrained_exists=False, topn=12):
    """
    Puts everything together: loads the model, loads the features, applies
    cosine similarity, and copies the top `topn` matching items to a static
    directory.

    Args:
        item: the item you want to search for (e.g., couch)
        city: the city where you are searching
        thedir: the primary directory (defined early and passed around for
            easily porting all programs elsewhere, e.g., to AWS)

    Returns:
        The matching file names, their item IDs, and their cosine
        similarities; also copies the matching images to outdir.
    """
    # Clear any previous Keras/TensorFlow state before loading the model
    K.clear_session()
    tf.compat.v1.reset_default_graph()
    tf.keras.backend.clear_session()

    if not sold:
        folder = thedir + city + '/' + item + '_images/'
        features_path = thedir + city + '/cnn/' + item + '_features/'
        file_mapping_path = thedir + city + '/cnn/' + item + '_file_mapping/'
        # Create the cnn/ parent directory first if it does not exist yet
        cnn_dir = features_path.replace(features_path.split('/')[-2] + '/', '')
        if not os.path.exists(cnn_dir):
            os.mkdir(cnn_dir)
        if not os.path.exists(features_path):
            os.mkdir(features_path)
        if not os.path.exists(file_mapping_path):
            os.mkdir(file_mapping_path)
    else:
        folder = thedir + city + '/' + item + '_images/sold/'
        features_path = thedir + city + '/cnn/sold_' + item + '_features/'
        file_mapping_path = (thedir + city + '/cnn/sold_' + item +
                             '_file_mapping/')
        if not os.path.exists(features_path):
            os.mkdir(features_path)
        if not os.path.exists(file_mapping_path):
            os.mkdir(file_mapping_path)

    model = load_headless_pretrained_model(pretrained_exists=pretrained_exists)

    if first or features_only:
        plural = 'es' if item == 'couch' else 's'
        print("%% You are generating the image features for all " + item +
              plural + " from " + city)
        # Load all images into memory; there are not that many
        images, image_paths = load_images(folder)
        images_features, file_index = generate_features(image_paths, model)
        vector_search.save_features(features_path, images_features,
                                    file_mapping_path, file_index)
        if features_only:
            return
    else:
        print("%% You already have the image features in hand -- loading them "
              "from disk.")
        images_features, file_index = vector_search.load_features(
            features_path, file_mapping_path)

    # Define the location of the file uploaded by the user
    if not input_file:
        tfiles = os.listdir('myflask/static/uploads/')
        for ifile in tfiles:
            if ifile.endswith(".jpg"):
                input_file = 'myflask/static/uploads/' + ifile

    # Reset the session again before re-loading the model for inference
    K.clear_session()
    tf.compat.v1.reset_default_graph()
    tf.keras.backend.clear_session()
    model = load_headless_pretrained_model(pretrained_exists=pretrained_exists)

    # Load in the single input image from the user
    img = image.load_img(input_file, target_size=(224, 224))
    x_raw = image.img_to_array(img)
    x_expand = np.expand_dims(x_raw, axis=0)

    # Extract the image features according to the headless model
    singleinput = preprocess_input(x_expand)
    single_image_features = model.predict(singleinput)

    # Apply cosine similarity between the features of the uploaded image
    # and the features of all directory images
    print("%% That was fast! Applying cosine similarity and finding images")
    cosine_similarities = cosine_similarity(single_image_features,
                                            images_features)[0]

    # Get the indices of the topn most similar images
    top_N_idx = np.argsort(cosine_similarities)[-topn:][::-1]

    # Get the topn most similar image files
    topfiles = [file_index[i] for i in top_N_idx]

    # Copy them to the static matches folder
    barefiles = []
    match_ids = []
    for topfile in topfiles:
        filename = topfile.split('/')[-1]
        copyfile(topfile, outdir + site + '/' + filename)
        barefiles.append(filename)
        match_ids.append(int(filename.replace('.jpg', '')))

    # Release the session after prediction
    K.clear_session()
    print("%% Cosine similarity complete. Matched images are in "
          "myflask/static/matches/" + site)
    return barefiles, match_ids, cosine_similarities[top_N_idx]
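
# A hypothetical usage sketch (the argument values below are illustrative,
# not from the repo): match an uploaded image against the stored couch
# features for Los Angeles and return the 12 closest listings.
matches, match_ids, scores = run(
    item='couch',
    city='los_angeles',
    thedir='/path/to/scraping/offerup/',  # assumed data root
    site='offerup',
    first=False,  # assumes features were generated on an earlier run
    topn=12,
)
print(list(zip(matches, scores)))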