def knn_alt_gender_classifier(target_prod_embedding, k_nbors):
    """Estimate Pr[female] from the nearest male and nearest female references.

    The target embedding is compared (with ``cosine_sim``) against every entry
    of the module-level ``male_reference_embeddings`` and
    ``female_reference_embeddings``.  For each gender the ``k_nbors`` highest
    similarities are averaged; the female/male ratio of those two averages is
    treated as odds and converted into a probability.

    Args:
        target_prod_embedding: embedding of the product to classify, in
            whatever form ``cosine_sim`` accepts.
        k_nbors: how many of the closest reference products to average for
            each gender.

    Returns:
        ``odds / (1 + odds)`` where ``odds`` is the mean female similarity
        divided by the mean male similarity.
    """

    def mean_similarity_of_k_closest(reference_embeddings):
        # Similarity of the target to every reference product of one gender.
        sims = [
            cosine_sim(target_prod_embedding, ref_embedding)
            for ref_embedding in reference_embeddings
        ]
        # argpartition leaves the k largest values in the last k slots
        # (in no particular order), which is all the mean needs.
        closest = np.argpartition(sims, -k_nbors)[-k_nbors:]
        return np.mean([sims[position] for position in closest])

    male_mean_sim = mean_similarity_of_k_closest(male_reference_embeddings)
    female_mean_sim = mean_similarity_of_k_closest(female_reference_embeddings)

    female_odds = female_mean_sim / male_mean_sim
    return female_odds / (1 + female_odds)
def knn_gender_classifier(target_prod_embedding,
                          k_nbors,
                          sim_weighting=True,
                          return_female_prob_only=False):
    """Classify a product's gender from its k nearest reference products.

    The target embedding is compared (with ``cosine_sim``) against every entry
    of the module-level ``combined_reference_embeddings``; the gender labels
    of the ``k_nbors`` most similar entries are read from
    ``combined_reference_gender_labels``.  Any label other than ``"male"`` is
    counted as female.

    Args:
        target_prod_embedding: embedding of the product to classify.
        k_nbors: number of nearest reference products to consult.
        sim_weighting: if true, each neighbour votes with a weight
            proportional to its similarity minus the smallest similarity among
            the k neighbours (so the farthest neighbour gets weight 0).  If
            that leaves no weight at all (``k_nbors == 1`` or all similarities
            equal) the neighbours vote equally.  If false, every neighbour
            votes equally.
        return_female_prob_only: if true, return the female probability;
            otherwise print a one-line summary and return ``None``.

    Returns:
        The estimated probability that the product is female when
        ``return_female_prob_only`` is true, else ``None``.
    """
    start_time = timeit.default_timer()

    # Use the embedding that was passed in (not a same-named global), so the
    # function classifies the product the caller actually asked about.
    sim_to_target_product = [
        cosine_sim(target_prod_embedding, ref_embedding)
        for ref_embedding in combined_reference_embeddings
    ]
    # argpartition leaves the k largest similarities in the last k slots.
    k_closest_ref_indices = np.argpartition(sim_to_target_product,
                                            -k_nbors)[-k_nbors:]
    neighbour_is_female = np.array([
        combined_reference_gender_labels[x] != "male"
        for x in k_closest_ref_indices
    ])

    if sim_weighting:
        k_closest_sims = np.array(
            [sim_to_target_product[x] for x in k_closest_ref_indices])
        shifted_sims = k_closest_sims - k_closest_sims.min()
        total_shifted = shifted_sims.sum()
        if total_shifted > 0:
            female_prob = shifted_sims[neighbour_is_female].sum() / total_shifted
        else:
            # Degenerate weights (0/0): fall back to an unweighted vote
            # instead of producing NaN.
            female_prob = neighbour_is_female.mean()
    else:
        female_prob = neighbour_is_female.mean()

    if return_female_prob_only:
        return female_prob
    print(
        f"k={k_nbors} Pr[Female]={female_prob:.2f} time_elapsed: {timeit.default_timer()-start_time}"
    )
# Sanity check of the k-NN gender signal on one randomly chosen reference product.
# Load every saved reference embedding: male files first, then female files.
# NOTE(review): indexing `embeddings` with an index into `prod_IDs` /
# `gender_labels` assumes all three share the same ordering - confirm.
embeddings = [
    np.load(f"{male_reference_img_embeddings_dir}{i}")
    for i in os.listdir(male_reference_img_embeddings_dir)
] + [
    np.load(f"{female_reference_img_embeddings_dir}{i}")
    for i in os.listdir(female_reference_img_embeddings_dir)
]
male_density = []
k_neighbours = 200

#for j in tqdm( range(len(prod_IDs)) ):
j = np.random.choice(range(len(prod_IDs)))

from PIL import Image

# The product image lives in either the male or the female image directory;
# try male first and fall back to female only when the file cannot be opened.
try:
    target_image = Image.open(f"{male_reference_img_dir}{prod_IDs[j]}.jpg")
except OSError:
    target_image = Image.open(f"{female_reference_img_dir}{prod_IDs[j]}.jpg")
display(target_image)

target_embedding = embeddings[j]
# The target is itself part of `embeddings`, so it is compared with itself too.
sim_to_target = [
    cosine_sim(target_embedding, ref_embedding) for ref_embedding in embeddings
]
k_closest_ref_indices = np.argpartition(sim_to_target,
                                        -k_neighbours)[-k_neighbours:]

true_gender_label = gender_labels[j]
# Labels are summed to count males, i.e. presumably 1 = male, 0 = female.
proportion_of_neighbours_male = np.array(
    [gender_labels[i] for i in k_closest_ref_indices]).sum() / k_neighbours

# Signed margin of the neighbourhood vote: 0 when the majority disagrees with
# the true label, otherwise the distance of the male proportion from 0.5.
density_for_target = None
if true_gender_label == 1 and proportion_of_neighbours_male < 0.5:
    density_for_target = 0
elif true_gender_label == 0 and proportion_of_neighbours_male > 0.5:
    density_for_target = 0
elif true_gender_label == 1 and proportion_of_neighbours_male >= 0.5:
    density_for_target = proportion_of_neighbours_male - 0.5
elif true_gender_label == 0 and proportion_of_neighbours_male <= 0.5:
    density_for_target = proportion_of_neighbours_male - 0.5
else:
    print(f"FAILED: j={j}")

# Only report a density that was actually recorded for this product; on the
# FAILED path there is nothing new in `male_density` to print.
if density_for_target is not None:
    male_density.append(density_for_target)
    print(f"{true_gender_label}\tmale density: {density_for_target}")
def find_closest_k_products(
        target_product_ID,
        k,
        embedding_weights,  # weights don't need to sum to 1 (are normalised to sum to 1 by the function)
        limit_to_same_category=False,
        limit_to_same_gender=False):
    """Return the IDs of the ``k`` products most similar to a target product.

    Similarity is a weighted sum of ``cosine_sim`` values, one per embedding
    type stored under ``product_database_dict[<id>]["embeddings"]``.

    Args:
        target_product_ID: key of the target product in
            ``product_database_dict``.
        k: number of products to return.  Values larger than the number of
            eligible products are clamped; ``k <= 0`` returns ``[]``.
        embedding_weights: dict with one weight per embedding type (keys as in
            the example below).  The weights are normalised to sum to 1 on a
            copy; the caller's dict is left untouched.  Embeddings whose
            weight is 0 are skipped entirely.
        limit_to_same_category: only consider products sharing at least one
            predicted category label with the target.
        limit_to_same_gender: only consider products whose predicted gender
            label(s) equal the target's.

    Returns:
        List of product IDs ordered from most to least similar.  The target
        product itself is never included.

    Raises:
        KeyError: if the target ID is unknown or one of the six embedding
            weights is missing.
        ZeroDivisionError: if the weights sum to zero.

    ## EXAMPLE USAGE ##
    find_closest_k_products(
        target_product_ID = "060601ABCG5"
        , k = 6
        , embedding_weights = {
            "global_image_ResNet50":2
            , "local_image_ResNet50":2
            , "text_tf_idf":1
            , "text_spacy_title":0.0
            , "text_spacy_title_hierarchy":0.0
            , "text_spacy_title_hierarchy_copy":0.0
        }
    )
    """
    # Normalise into a new dict so the caller's weights are not modified.
    sum_embedding_weights = sum(embedding_weights.values())
    normalised_weights = {
        name: weight / sum_embedding_weights
        for name, weight in embedding_weights.items()
    }
    print("normalised embeddings weights:")
    for name, weight in normalised_weights.items():
        print(f"\t{name}: {weight}")

    # Where each weighted embedding lives inside a product's "embeddings" entry.
    embedding_locations = {
        "global_image_ResNet50": ("img_ResNet50", "global"),
        "local_image_ResNet50": ("img_ResNet50", "local"),
        "text_tf_idf": ("text", "tf_idf"),
        "text_spacy_title": ("text", "spacy_title"),
        "text_spacy_title_hierarchy": ("text", "spacy_title_hierarchy"),
        "text_spacy_title_hierarchy_copy":
        ("text", "spacy_title_hierarchy_copy"),
    }
    # Zero-weight embeddings contribute nothing, so don't compute them.
    active_embeddings = [
        (normalised_weights[name], family, variant)
        for name, (family, variant) in embedding_locations.items()
        if normalised_weights[name] != 0
    ]

    target_product_info = product_database_dict[target_product_ID]
    target_embeddings = target_product_info["embeddings"]

    # reduce set of candidate products using product category and gender filters:
    if limit_to_same_category:
        target_product_categories = set(
            target_product_info["predicted_category"]["final_label(s)"])
    if limit_to_same_gender:
        target_product_gender = target_product_info[
            "predicted_product_gender"]["final_label(s)"]

    def is_eligible(product_id):
        if product_id == target_product_ID:
            return False
        info = product_database_dict[product_id]
        if limit_to_same_gender and info["predicted_product_gender"][
                "final_label(s)"] != target_product_gender:
            return False
        if limit_to_same_category and target_product_categories.isdisjoint(
                info["predicted_category"]["final_label(s)"]):
            return False
        return True

    eligible_product_IDs = [i for i in product_database_dict if is_eligible(i)]

    # Never ask for more neighbours than there are candidates.
    k = min(k, len(eligible_product_IDs))
    if k <= 0:
        return []

    similarity_to_target_prod = []
    for prod_id_i in tqdm(eligible_product_IDs):
        candidate_embeddings = product_database_dict[prod_id_i]["embeddings"]
        similarity_to_target_prod.append(
            sum(weight * cosine_sim(target_embeddings[family][variant],
                                    candidate_embeddings[family][variant])
                for weight, family, variant in active_embeddings))

    x = np.array(similarity_to_target_prod)
    idx = np.argpartition(x, -k)[-k:]  # Indices not sorted
    # Indices sorted by value from largest to smallest
    top_k_indices = idx[np.argsort(x[idx])][::-1]
    return [eligible_product_IDs[i] for i in top_k_indices]
def _local_candidates_for_ctl_product(ctl_category, target_product_gender,
                                      limit_to_same_category,
                                      limit_to_same_gender):
    """Return the local product IDs allowed to stand in for one CTL product."""
    if not (limit_to_same_category or limit_to_same_gender):
        return prod_id_list

    if limit_to_same_category:
        # The category mapping is only consulted when the category filter is on.
        allowed_local_categories = set(
            link_CTL_categ_to_local_prod_categ[ctl_category])

    candidates = []
    for local_id in prod_id_list:
        info = product_database_dict[local_id]
        if limit_to_same_gender and info["predicted_product_gender"][
                "final_label(s)"] != target_product_gender:
            continue
        if limit_to_same_category and allowed_local_categories.isdisjoint(
                info["predicted_category"]["final_label(s)"]):
            continue
        candidates.append(local_id)
    return candidates


def generate_CTL_outfits(target_product_ID,
                         desired_n_outfit_recommendations,
                         limit_to_same_category=True,
                         limit_to_same_gender=True
                         #, limit_to_complete_outfits = True # TODO
                         ):
    """Build "complete the look" (CTL) outfits around a target product.

    ALGORITHM:
        1. Compare the target's local image embedding with the individual
           products of the reference collages and pick the closest ones, each
           from a different collage, until
           ``desired_n_outfit_recommendations`` collages are found.  At most
           ``10 * desired_n_outfit_recommendations`` nearest CTL products are
           inspected.
        2. For each matching collage, replace every other product in it with
           the most similar local product (by image embedding).

    Args:
        target_product_ID: key of the target product in
            ``product_database_dict``.
        desired_n_outfit_recommendations: number of outfits wanted.  Fewer are
            returned when not enough distinct collages are found among the
            inspected CTL products.
        limit_to_same_category: in step 1 only CTL products whose category
            maps from one of the target's categories are considered; in step
            2 only local products whose category maps from the CTL product's
            category are considered.
        limit_to_same_gender: in step 2 only local products with the target's
            predicted gender label(s) are considered.

    Returns:
        Dict mapping each matched collage ID to a list of local product IDs;
        the first entry is always ``target_product_ID``.  A collage product
        for which no replacement could be found is reported with a "FAILED"
        message and left out.

    ## EXAMPLE USAGE ##
    target_product_ID = np.random.choice( list(product_database_dict.keys()) )
    Image.open( f"{global_images_dir}{target_product_ID}.png" )
    generate_CTL_outfits(
        target_product_ID = target_product_ID
        , desired_n_outfit_recommendations = 4
        , limit_to_same_category = True
        , limit_to_same_gender = True
    )
    """
    target_info = product_database_dict[target_product_ID]
    target_product_image_embedding = target_info["embeddings"]["img_ResNet50"][
        "local"]
    target_product_categories = target_info["predicted_category"][
        "final_label(s)"]
    target_product_gender = target_info["predicted_product_gender"][
        "final_label(s)"]

    # limit matches to same category:
    if limit_to_same_category:
        allowed_CTL_prod_categories = []
        for target_categ_i in target_product_categories:
            allowed_CTL_prod_categories += link_local_prod_categ_to_CTL_categ[
                target_categ_i]
        allowed_CTL_match_indices = [
            ctl_index
            for ctl_index, ctl_categ in enumerate(CTL_individ_prod_categs)
            if ctl_categ in allowed_CTL_prod_categories
        ]
    else:
        allowed_CTL_match_indices = list(range(len(CTL_individ_prod_categs)))

    # Similarity of the target to the allowed CTL products only, so that a
    # disallowed product can never be ranked.
    similarity_to_target_prod = np.array([
        cosine_sim(target_product_image_embedding,
                   CTL_individ_prod_img_embeddings[ctl_index])
        for ctl_index in tqdm(allowed_CTL_match_indices)
    ])

    # Inspect 10x more products than outfits wanted, because several of the
    # closest products may belong to the same collage.
    n_to_inspect = min(desired_n_outfit_recommendations * 10,
                       len(allowed_CTL_match_indices))

    collage_matches = []
    # for remembering which product in the collage matched to the initial target product
    prod_id_in_collage_matches = {}
    if n_to_inspect > 0:
        nearest = np.argpartition(similarity_to_target_prod,
                                  -n_to_inspect)[-n_to_inspect:]
        # Positions sorted by similarity from largest to smallest.
        nearest = nearest[np.argsort(similarity_to_target_prod[nearest])][::-1]
        for position in nearest:
            if len(collage_matches) >= desired_n_outfit_recommendations:
                break
            ctl_index = allowed_CTL_match_indices[position]
            # IDs are read from the original lists so their types are preserved.
            collage_id = CTL_individ_prod_collage_IDs[ctl_index]
            if collage_id not in prod_id_in_collage_matches:
                collage_matches.append(collage_id)
                prod_id_in_collage_matches[
                    collage_id] = CTL_individ_prod_IDs_in_collage[ctl_index]

    recommended_ctl_outfits = {}
    for collage_j in collage_matches:
        # Every individual CTL product that belongs to this collage.
        collage_member_indices = [
            ctl_index
            for ctl_index, collage_id in enumerate(CTL_individ_prod_collage_IDs)
            if collage_id == collage_j
        ]
        # first recommended product is the original target product itself
        recommended_prods = [target_product_ID]
        for i, ctl_index in enumerate(collage_member_indices):
            # Best effort: a product that cannot be matched is reported and
            # skipped rather than aborting the whole outfit.
            try:
                if CTL_individ_prod_IDs_in_collage[
                        ctl_index] == prod_id_in_collage_matches[collage_j]:
                    # this is the CTL product which originally matched the
                    # target; the target itself already fills that slot
                    continue
                potential_match_local_prod_IDs = _local_candidates_for_ctl_product(
                    CTL_individ_prod_categs[ctl_index], target_product_gender,
                    limit_to_same_category, limit_to_same_gender)
                similarity_to_ctl_prod = [
                    cosine_sim(
                        CTL_individ_prod_img_embeddings[ctl_index],
                        product_database_dict[local_id]["embeddings"]
                        ["img_ResNet50"]["local"])
                    for local_id in tqdm(potential_match_local_prod_IDs)
                ]
                recommended_prods.append(potential_match_local_prod_IDs[
                    np.argmax(similarity_to_ctl_prod)])
            except Exception:
                print(f"FAILED: collage {collage_j} product i={i}")
        recommended_ctl_outfits[collage_j] = recommended_prods
    return recommended_ctl_outfits
# Load every female reference embedding, remembering which product ID
# (file name without the ".npy" extension) each one belongs to.
female_reference_embeddings = []
female_ref_ids = []
for ref_f in tqdm(os.listdir(female_reference_img_embeddings_dir)):
    # to work on a random sample instead, iterate over:
    # tqdm( np.random.choice( os.listdir(female_reference_img_embeddings_dir), size=ref_sample_size, replace=False) )
    # Raw, anchored pattern: strip only a trailing ".npy" extension.
    female_ref_ids.append(re.sub(r"\.npy$", "", ref_f))
    female_reference_embeddings.append(
        np.load(f"{female_reference_img_embeddings_dir}{ref_f}"))

# Male references first, then female; the label list is built in the same order.
combined_reference_embeddings = male_reference_embeddings + female_reference_embeddings
combined_reference_gender_labels = ["male"] * len(
    male_reference_embeddings) + ["female"] * len(female_reference_embeddings)

# For every individual CTL product image: find its k nearest reference
# products and write the share of them labelled "female" to a text file.
for ctl_prod_j in tqdm(os.listdir(individual_product_img_dir)):
    prod_id_j = re.sub(r"\.png$", "", ctl_prod_j)
    prod_j_embedding = np.load(
        f"{individual_product_img_embeddings_dir}{prod_id_j}.npy")

    sim_to_target_product = [
        cosine_sim(prod_j_embedding, ref_embed_b)
        for ref_embed_b in combined_reference_embeddings
    ]
    # argpartition leaves the k largest similarities in the last k slots.
    k_closest_ref_indices = np.argpartition(sim_to_target_product,
                                            -k_neighbours)[-k_neighbours:]
    k_closest_ref_true_gender_labels = np.array(
        [combined_reference_gender_labels[x] for x in k_closest_ref_indices])
    female_proportion = (k_closest_ref_true_gender_labels == "female"
                         ).sum() / len(k_closest_ref_true_gender_labels)

    with open(f"{desired_output_dir}{prod_id_j}.txt", "w") as f:
        f.write(str(female_proportion))
# try a random product --------------------------------------------------------------------------------------------------------------
k_neighbours = 100

# Pick a random image file and derive the product ID from its name
# (raw, anchored pattern: strip only a trailing ".png" extension).
target_prod_id = re.sub(r"\.png$", "",
                        np.random.choice(os.listdir(target_prod_images_dir)))
target_prod_image = Image.open(f"{target_prod_images_dir}{target_prod_id}.png")
target_prod_img_embedding = np.load(
    f"{target_prod_embeddings_dir}{target_prod_id}.npy")

start_time = timeit.default_timer()

# k closest male reference product(s):
male_sim_to_target_product = [
    cosine_sim(target_prod_img_embedding, ref_embed_m)
    for ref_embed_m in male_reference_embeddings
]
k_closest_ref_male_indices = np.argpartition(male_sim_to_target_product,
                                             -k_neighbours)[-k_neighbours:]

# k closest female reference product(s):
female_sim_to_target_product = [
    cosine_sim(target_prod_img_embedding, ref_embed_f)
    for ref_embed_f in female_reference_embeddings
]
k_closest_ref_female_indices = np.argpartition(female_sim_to_target_product,
                                               -k_neighbours)[-k_neighbours:]

#target_prod_image.show()
#for i in k_closest_ref_male_indices: