# Module-level imports assumed by these test methods (from the original file):
#   import numpy as np
#   import numpy.random as rndm
#   from scipy.linalg import qr
#   from dppy.finite_dpps import FiniteDPP

def test_likelihood_kernel(self):
    # Likelihood kernel L = V diag(eig_vals) V^T with eigenvalues >= 2,
    # built on an orthonormal basis V obtained via economic QR.
    eig_vals = 1 + rndm.geometric(p=0.5, size=self.rank)
    eig_vecs, _ = qr(rndm.randn(self.N, self.rank), mode='economic')

    dpp = FiniteDPP(kernel_type='likelihood',
                    projection=False,
                    **{'L': (eig_vecs * eig_vals).dot(eig_vecs.T)})

    for size in self.sizes:
        # Exact k-DPP samplers.
        for mode in ('GS', 'GS_bis', 'KuTa12'):
            dpp.flush_samples()
            for _ in range(self.nb_samples):
                dpp.sample_exact_k_dpp(size, mode)
            self.check_right_cardinality(dpp, dpp.list_of_samples)

        # MCMC k-DPP samplers; note `mode` is not forwarded, so
        # sample_mcmc_k_dpp runs with its default mode on each pass.
        for mode in ('AED', 'AD'):
            dpp.flush_samples()
            dpp.sample_mcmc_k_dpp(size, **{'nb_iter': self.nb_samples})
            self.check_right_cardinality(dpp, dpp.list_of_samples[0])
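# A minimal sketch of the property `check_right_cardinality` asserts above,
# assuming `samples` is a list of index collections (the helper name here is
# illustrative, not from the original test class): every sample drawn from a
# k-DPP contains exactly `size` points.
def all_samples_have_size(samples, size):
    return all(len(sample) == size for sample in samples)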
def test_likelihood_kernel_L_gram_factor(self):
    # Likelihood kernel given through a Gram factor phi: L = phi.T phi.
    phi = rndm.randn(self.rank, self.N)

    dpp = FiniteDPP(kernel_type='likelihood',
                    projection=False,
                    **{'L_gram_factor': phi})

    for size in self.sizes:
        # Exact k-DPP samplers.
        for mode in ('GS', 'GS_bis', 'KuTa12'):
            dpp.flush_samples()
            for _ in range(self.nb_samples):
                dpp.sample_exact_k_dpp(size, mode)
            self.check_right_cardinality(dpp, dpp.list_of_samples)

        # MCMC k-DPP samplers (default mode; `mode` is not forwarded).
        for mode in ('AED', 'AD'):
            dpp.flush_samples()
            dpp.sample_mcmc_k_dpp(size, **{'nb_iter': self.nb_samples})
            self.check_right_cardinality(dpp, dpp.list_of_samples[0])
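# An illustrative sketch (not part of the test suite) of what 'L_gram_factor'
# encodes: the likelihood kernel is L = phi.T phi, so a rank x N factor phi
# defines an N x N kernel of rank at most `rank`, whose nonzero spectrum
# coincides with that of the small rank x rank dual kernel phi phi.T.
import numpy as np
import numpy.random as rndm

rank, N = 6, 100
phi = rndm.randn(rank, N)
L = phi.T.dot(phi)            # N x N likelihood kernel
L_dual = phi.dot(phi.T)       # rank x rank dual kernel
eig_L = np.sort(np.linalg.eigvalsh(L))[-rank:]   # nonzero eigenvalues of L
eig_dual = np.sort(np.linalg.eigvalsh(L_dual))
assert np.allclose(eig_L, eig_dual)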
def test_kernel_eig(self):
    # Correlation kernel K given by its eigendecomposition, with
    # eigenvalues drawn uniformly in [0, 1).
    eig_vals = rndm.rand(self.rank)
    eig_vecs, _ = qr(rndm.randn(self.N, self.rank), mode='economic')

    dpp = FiniteDPP(kernel_type='correlation',
                    projection=False,
                    **{'K_eig_dec': (eig_vals, eig_vecs)})

    for size in self.sizes:
        # Exact k-DPP samplers.
        for mode in ('GS', 'GS_bis', 'KuTa12'):
            dpp.flush_samples()
            for _ in range(self.nb_samples):
                dpp.sample_exact_k_dpp(size, mode)
            self.check_right_cardinality(dpp, dpp.list_of_samples)

        # MCMC k-DPP samplers (default mode; `mode` is not forwarded).
        for mode in ('AED', 'AD'):
            dpp.flush_samples()
            dpp.sample_mcmc_k_dpp(size, **{'nb_iter': self.nb_samples})
            self.check_right_cardinality(dpp, dpp.list_of_samples[0])
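# A small illustrative check (not in the test suite) of the standard
# correspondence between the two kernel types used in these tests: when all
# eigenvalues of the correlation kernel K lie in [0, 1), the likelihood
# kernel L = K (I - K)^{-1} shares K's eigenvectors, with eigenvalues
# lambda / (1 - lambda).
import numpy as np
import numpy.random as rndm
from scipy.linalg import qr

rank, N = 6, 100
eig_vals = rndm.rand(rank)
eig_vecs, _ = qr(rndm.randn(N, rank), mode='economic')
K = (eig_vecs * eig_vals).dot(eig_vecs.T)
L = K.dot(np.linalg.inv(np.eye(N) - K))
eig_L = np.sort(np.linalg.eigvalsh(L))[-rank:]   # nonzero eigenvalues of L
assert np.allclose(eig_L, np.sort(eig_vals / (1 - eig_vals)))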
def test_mcmc_sampler_basis_exchange(self):
    """Test whether the 'E' (basis exchange) MCMC sampling mode generates
    samples with the right 1- and 2-point inclusion probabilities when the
    DPP is defined by an orthogonal projection correlation kernel K given
    through its eigendecomposition.
    """
    eig_vals = np.ones(self.rank)
    eig_vecs, _ = qr(rndm.randn(self.N, self.rank), mode='economic')

    dpp = FiniteDPP(kernel_type='correlation',
                    projection=True,
                    **{'K_eig_dec': (eig_vals, eig_vecs)})

    dpp.sample_mcmc_k_dpp(size=self.rank, **{'nb_iter': 1000})

    self.assertTrue(self.singleton_adequation(dpp, dpp.list_of_samples[0]))
    self.assertTrue(self.doubleton_adequation(dpp, dpp.list_of_samples[0]))
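# A minimal sketch of the checks the adequation helpers above perform,
# assuming `samples` is a list of index lists and K is the correlation
# kernel. For a DPP with symmetric kernel K:
#   P(i in X)       = K[i, i]
#   P({i, j} in X)  = K[i, i] * K[j, j] - K[i, j] ** 2
import numpy as np

def empirical_singletons(samples, N):
    """Empirical inclusion frequency of each item over the samples."""
    counts = np.zeros(N)
    for sample in samples:
        counts[list(sample)] += 1.0
    return counts / len(samples)

# Usage sketch: compare against the theoretical marginals diag(K), e.g.
# np.allclose(empirical_singletons(samples, N), np.diag(K), atol=0.05).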
def get_recommendations(self, unseen_items, liked_items, indices,
                        sim_matrix, user_id):
    # Imports assumed by this method (module level in the original file):
    #   import heapq, json, random
    #   import numpy as np
    #   from numpy import dot
    #   from numpy.linalg import norm
    #   from scipy import spatial
    #   from dppy.finite_dpps import FiniteDPP
    # Vectors is the app's SQLAlchemy model holding word vectors.
    zufall = random.random()  # random draw used below to vary explanation wording

    liked_items_ind = liked_items.index
    unseen_items_ind = unseen_items.index
    liked_iloc = np.asarray([indices.index(v) for v in liked_items_ind])
    not_seen_iloc = np.asarray([indices.index(v) for v in unseen_items_ind])

    # For every item, its maximum similarity to any liked item,
    # restricted to the unseen items.
    result = sim_matrix[liked_iloc.ravel()]
    max_sim = np.amax(np.asarray(result), axis=0)
    max_not_seen = max_sim[not_seen_iloc]
    print(max_not_seen.shape)

    # Candidates are unseen items with a similarity of at least 0.5.
    max_not_seen_lower = max_not_seen[max_not_seen >= 0.5]
    candidates = max_not_seen_lower.shape[0]
    print('Candidates: ', candidates)

    # Odd user ids: plain similarity-based top 10, no diversification.
    if user_id % 2 == 1:
        if candidates < 5:
            return 0
        max_not_seen_ind = np.argpartition(max_not_seen, -10)[-10:]
        # final_indices holds the actual item ids of the posts.
        final_indices = unseen_items_ind[max_not_seen_ind]
        recommendation_df = unseen_items.loc[
            unseen_items.index.isin(final_indices)]
        recommendation_df['explanations'] = ''
        return recommendation_df

    # Even user ids: DPP-diversified recommendations, only when there
    # are enough candidates.
    if user_id % 2 == 0 and candidates > 15:
        # The length of the candidate list can be changed here.
        if candidates >= 30:
            max_not_seen_ind = np.argpartition(max_not_seen, -30)[-30:]
        else:
            max_not_seen_ind = np.argpartition(
                max_not_seen, -candidates)[-candidates:]
        # final_indices holds the actual item ids of the posts.
        final_indices = unseen_items_ind[max_not_seen_ind]
        print(final_indices.shape)

        recommendation_df = unseen_items.loc[
            unseen_items.index.isin(final_indices)]
        vectors = recommendation_df['final_vectors'].tolist()
        final_recommendations = list(recommendation_df.index)

        # Likelihood kernel from pairwise cosine similarities; an
        # illustrative sketch of this helper follows after this method.
        # Alternative kernel: Phi = np.array(vectors); L = Phi.dot(Phi.T)
        L = self.cosine_similarity(np.asarray(vectors))
        DPP = FiniteDPP('likelihood', **{'L': L})

        k = 10 if candidates >= 30 else round(candidates / 3)

        DPP.flush_samples()
        # Alternative: repeated exact sampling via DPP.sample_exact_k_dpp(size=k).
        DPP.sample_mcmc_k_dpp(size=k, **{'nb_iter': 200})

        # Among the MCMC samples, keep the subset whose similarity
        # submatrix has the largest determinant, i.e. the most diverse one.
        list_of_samples = DPP.list_of_samples[0]
        det_sim_list = []
        for values in list_of_samples:
            final_diversity_list_indices = [
                final_recommendations[i] for i in values]
            recommendation_diversity_df = unseen_items.loc[
                final_diversity_list_indices, :]
            vectors = recommendation_diversity_df['final_vectors'].tolist()
            dist_out_diversity = self.cosine_similarity(np.asarray(vectors))
            det_sim_list.append(np.linalg.det(dist_out_diversity))

        final_diversity_list = list_of_samples[np.argmax(det_sim_list)]

        # Indices of the final, diversified recommendations.
        final_diversity_list_indices = [
            final_recommendations[i] for i in final_diversity_list]
        recommendation_diversity_df = unseen_items.loc[
            final_diversity_list_indices, :]
        # Get the most similar liked document for every recommended item.
        sim_list = []
        for values in recommendation_diversity_df.final_vectors:
            most_similar_document = []
            for vals in liked_items.final_vectors:
                result = 1 - spatial.distance.cosine(values, vals)
                most_similar_document.append(result)
            sim_list.append(most_similar_document)

        titles = liked_items['title'].tolist()
        sim_doc_explanations = []
        sim_doc_index = []
        for values in range(len(sim_list)):
            max_doc = np.argmax(sim_list[values])
            sim_doc_index.append(liked_items.index[max_doc])
            # Vary the wording of the explanation at random.
            if zufall <= 0.2:
                sim_doc_explanations.append(
                    'The liked document most similar to this recommended '
                    'post has the title: ' + str(titles[max_doc])
                    + ' and the audio_id: '
                    + str(liked_items.index[max_doc]) + '.')
            elif zufall <= 0.4:
                sim_doc_explanations.append(
                    'This post is recommended to you because you like the '
                    'post "' + str(titles[max_doc])
                    + '" with the audio ID: '
                    + str(liked_items.index[max_doc]) + '.')
            elif zufall <= 0.6:
                sim_doc_explanations.append(
                    'The post just recommended was selected for you based '
                    'on the liked post: "' + str(titles[max_doc])
                    + '" with the ID: '
                    + str(liked_items.index[max_doc]) + '.')
            else:
                sim_doc_explanations.append(
                    'You might like this title because it bears some '
                    'similarity to the post "' + str(titles[max_doc])
                    + '", which you like. That post has the ID: '
                    + str(liked_items.index[max_doc]) + '.')

        # Enrich each explanation with the most similar noun pairs between
        # the recommended post and its most similar liked post.
        for values in range(len(recommendation_diversity_df)):
            words_1 = recommendation_diversity_df.final_nouns.values[values]
            liked_id = int(sim_doc_index[values])
            words_2 = liked_items.final_nouns[liked_id]

            # Cosine similarity for every noun pair; the word vectors are
            # stored in the Vectors table.
            sim_dict = {}
            for v in set(words_1):
                for n in set(words_2):
                    word = str(v + ' ' + n)
                    v_result = Vectors.query.filter(
                        Vectors.word == v).first()
                    v_vector = json.loads(v_result.vector)
                    n_result = Vectors.query.filter(
                        Vectors.word == n).first()
                    n_vector = json.loads(n_result.vector)
                    sim_dict[word] = dot(v_vector, n_vector) / (
                        norm(v_vector) * norm(n_vector))

            # Keep the three most similar pairs.
            top_3_sim_dict = dict(
                heapq.nlargest(3, sim_dict.items(), key=lambda kv: kv[1]))

            zufall_2 = random.random()
            for pair, sim_val in top_3_sim_dict.items():
                words = pair.split()
                if 0.5 <= sim_val < 1:
                    # Similar but not identical words: vary the phrasing.
                    if zufall_2 <= 0.33:
                        sim_doc_explanations[values] += (
                            ' The word ' + str(words[0].capitalize())
                            + ' from the liked post is similar to the word '
                            + str(words[1].capitalize())
                            + ' from the recommended post.')
                    elif zufall_2 <= 0.66:
                        sim_doc_explanations[values] += (
                            ' The two words ' + str(words[0].capitalize())
                            + ' and ' + str(words[1].capitalize())
                            + ' are considered similar across the posts.')
                    else:
                        sim_doc_explanations[values] += (
                            ' The liked and the recommended post are '
                            'considered similar because of the words '
                            + str(words[0].capitalize()) + ' and '
                            + str(words[1].capitalize())
                            + ' occurring in them.')
                elif sim_val >= 1:
                    # A similarity of 1 means the identical word occurs in
                    # both posts; avoid repeating the sentence.
                    sentence = (
                        ' Both posts contain the identical word: '
                        + str(words[0].capitalize()) + '.')
                    if sentence not in sim_doc_explanations[values]:
                        sim_doc_explanations[values] += sentence

        recommendation_diversity_df['explanations'] = sim_doc_explanations
        return recommendation_diversity_df

    # Even user with too few candidates: signal "no recommendations".
    return 0
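# Hedged sketches (names are illustrative) of two helpers the method above
# relies on but which are not shown in this file: `cosine_similarity`, the
# pairwise cosine-similarity matrix of the rows of a 2-D array, and a
# DB-free version of the noun-pair similarity step, with word vectors held
# in an in-memory dict instead of the Vectors model.
import heapq
import numpy as np

def cosine_similarity(X):
    """Pairwise cosine similarities between the rows of X."""
    X = np.asarray(X, dtype=float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    X_unit = X / np.clip(norms, 1e-12, None)  # guard against zero rows
    return X_unit.dot(X_unit.T)

def top_word_pairs(words_1, words_2, word_vectors, k=3):
    """Top-k most similar (word_1, word_2) pairs by cosine similarity."""
    sims = {}
    for v in set(words_1):
        for n in set(words_2):
            a, b = word_vectors[v], word_vectors[n]
            sims[v + ' ' + n] = np.dot(a, b) / (
                np.linalg.norm(a) * np.linalg.norm(b))
    return dict(heapq.nlargest(k, sims.items(), key=lambda kv: kv[1]))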