def test_profile_evolution(self):
    """Test that a profile evolves with a normal context.

    A first update registers both the 'Einstein' context and the documents
    of the "study" categories; a second update only carries the documents.
    Afterwards no "study" preference may have a lower score than before and
    every 'Einstein' preference must have a strictly lower score.
    """
    def scores_by_category(preferences, wanted):
        # Map category pk -> score for the preferences whose category is wanted.
        return dict(
            (pref.category.pk, pref.score)
            for pref in preferences
            if pref.category in wanted
        )

    categories = list(DmozCategory.get_for_query(query="study"))
    other_cats = list(DmozCategory.get_for_query(query="Einstein"))
    docs = list(
        DocumentSurrogate.objects
        .filter(category__in=categories)
        .values_list('pk', flat=True)
    )
    call_options = {'lang': 'en', 'terms': True}

    # First pass: context and documents are both fed to the profile.
    update_profile.apply(args=[self.profile, 'Einstein', docs],
                         kwargs=call_options)
    snapshot = self.profile.preferences.all()
    before_boost = scores_by_category(snapshot, categories)
    before_decay = scores_by_category(snapshot, other_cats)

    # Second pass: only the documents.
    update_profile.apply(args=[self.profile, [], docs],
                         kwargs=call_options)
    snapshot = self.profile.preferences.all()
    after_boost = scores_by_category(snapshot, categories)
    after_decay = scores_by_category(snapshot, other_cats)

    # The documents' categories must have been boosted, the others decayed.
    # REMEMBER: a preference that already reached 1 cannot go any higher
    # (100% is the maximum score), hence the non-strict check for boosts.
    not_boosted = [pk for pk in after_boost
                   if after_boost[pk] < before_boost[pk]]
    not_decayed = [pk for pk in after_decay
                   if after_decay[pk] >= before_decay[pk]]
    self.assertFalse(not_boosted + not_decayed)
def test_as_dict(self):
    """Test the dictionary option.

    The keys of the dictionary returned with ``as_dict=True`` must be the
    primary keys of the categories returned by the plain query.
    """
    results = DmozCategory.get_for_query(self.query)
    dict_results = DmozCategory.get_for_query(self.query, as_dict=True)
    # list.sort() sorts in place and returns None, so comparing its return
    # values would always succeed; sorted() returns the ordered lists.
    result = sorted(dict_results.keys())
    expected = sorted(e.pk for e in results)
    self.assertEqual(result, expected)
def test_profile_creation_query(self):
    """Test that a profile is created with a direct query and documents.

    The profile is updated with the 'Einstein' terms plus the documents of
    the "study" categories, and is then checked against both category sets
    through the `_check_expected_profile` helper.
    """
    categories = DmozCategory.get_for_query(query="study")
    other_cats = DmozCategory.get_for_query(query="Einstein")
    docs = list(
        DocumentSurrogate.objects
        .filter(category__in=categories)
        .values_list('pk', flat=True)
    )
    update_profile.apply(args=[self.profile, 'Einstein', docs],
                         kwargs={'lang': 'en', 'terms': True})
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(self._check_expected_profile(
        self.profile, itertools.chain(categories, other_cats)))
def test_profile_expansion(self):
    """Test that a profile behaves properly with new preferences.

    A first update only carries the 'Einstein' terms; a second one only
    carries the documents of the "study" categories. The number of stored
    preferences must grow between the two.
    """
    categories = DmozCategory.get_for_query(query="study")
    docs = list(
        DocumentSurrogate.objects
        .filter(category__in=categories)
        .values_list('pk', flat=True)
    )
    call_options = {'lang': 'en', 'terms': True}

    # First pass: only the query terms. update_profile requires a `docs`
    # positional argument, so an explicit empty list is passed; without it
    # the call cannot execute the task body at all.
    update_profile.apply(args=[self.profile, 'Einstein', []],
                         kwargs=call_options)
    before = self.profile.preferences.count()

    # Second pass: only the documents, which bring new categories in.
    update_profile.apply(args=[self.profile, [], docs],
                         kwargs=call_options)
    after = self.profile.preferences.count()

    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(after > before)
def test_query(self):
    """Test that a query returns a non-empty iterable of categories."""
    results = DmozCategory.get_for_query(self.query)
    # Separate assertions (instead of one and-ed expression passed to the
    # deprecated assert_ alias) so a failure tells which condition broke.
    self.assertTrue(len(results))
    self.assertTrue(hasattr(results, '__iter__'))
    self.assertTrue(isinstance(results[0], DmozCategory))
def test_category_list(self):
    """Test that an ambiguous query makes the classifier return the list of candidates.

    The primary keys of the categories found for `self.query` must match
    those of the categories whose title is in `self.query_candidates`.
    """
    found_pks = [category.pk
                 for category in DmozCategory.get_for_query(self.query)]
    found_pks.sort()

    candidates = DmozCategory.objects.filter(title__in=self.query_candidates)
    candidate_pks = candidates.order_by('pk').values_list('pk', flat=True)

    self.assertEqual(found_pks, list(candidate_pks))
def test_classify_terms_spanish(self):
    """Test that a profile is created with a direct query in Spanish.

    The profile is updated with the 'Einstein' terms and no documents,
    using lang='es', and is checked against the categories the same query
    yields in Spanish.
    """
    categories = DmozCategory.get_for_query(query="Einstein", lang='es')
    update_profile.apply(args=[self.profile, 'Einstein', []],
                         kwargs={'lang': 'es', 'terms': True})
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(self._check_expected_profile(self.profile, categories))
def test_build_query_english(self):
    """Test that a profile is created given an English context.

    The context is a full text (terms=False), so update_profile has to
    build the query from it before classifying.
    """
    categories = DmozCategory.get_for_query(query="Einstein", lang='en')
    ctx = """Einstein himself is well known for rejecting some of the claims of quantum mechanics."""
    update_profile.apply(args=[self.profile, ctx, []],
                         kwargs={'lang': 'en', 'terms': False})
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(self._check_expected_profile(self.profile, categories))
def test_classify_only_docs(self):
    """Test that a profile is created with only documents.

    The context is empty; only the documents of the "study" categories are
    passed, with the default language and terms options.
    """
    categories = DmozCategory.get_for_query(query="study")
    docs = list(
        DocumentSurrogate.objects
        .filter(category__in=categories)
        .values_list('pk', flat=True)
    )
    update_profile.apply(args=[self.profile, [], docs])
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(self._check_expected_profile(self.profile, categories))
def test_build_query_spanish(self):
    """Test that a profile is created given a Spanish context.

    The context is a full text (terms=False), so update_profile has to
    build the query from it before classifying.
    """
    categories = DmozCategory.get_for_query(query="Einstein", lang='es')
    ctx = """El mismo Einstein es conocido por haber rechazado algunas de las demandas de la mecánica cuántica"""
    update_profile.apply(args=[self.profile, ctx, []],
                         kwargs={'lang': 'es', 'terms': False})
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(self._check_expected_profile(self.profile, categories))
def test_spanish_query(self):
    """Test that a query in Spanish works.

    The exact Spanish query must return a single category: the one whose
    title is `self.exact_candidate`.
    """
    query = self.exact_queries['es']
    matches = list(DmozCategory.get_for_query(query, 'es'))
    if len(matches) != 1:
        self.fail('Only expecting one category!')
    only_match = matches[0]
    expected = DmozCategory.objects.get(title=self.exact_candidate)
    self.assertEqual(only_match, expected)
def test_english_query(self):
    """Test that a query for a category in English works.

    The exact English query must return a single category: the one whose
    title is `self.exact_candidate`.
    """
    query = self.exact_queries['en']
    matches = list(DmozCategory.get_for_query(query, 'en'))
    if len(matches) != 1:
        self.fail('Only expecting one category!')
    only_match = matches[0]
    expected = DmozCategory.objects.get(title=self.exact_candidate)
    self.assertEqual(only_match, expected)
def test_empty_query(self):
    """Test that no categories are returned for a non-existent word."""
    categories = DmozCategory.get_for_query('lorem ipsum')
    self.assertFalse(categories)
def test_empty_query_as_dict(self):
    """Test that the dict option yields an empty (falsy) result when the query matches nothing."""
    categories = DmozCategory.get_for_query('lorem ipsum', as_dict=True)
    self.assertFalse(categories)
def test_max_results(self):
    """Check that the max results constraint is respected."""
    limited = DmozCategory.get_for_query(self.query, max_results=1)
    self.assertEqual(len(limited), 1)
def test_score(self):
    """Test that an important candidate receives a high score.

    The first result for `self.query` must carry a `relative_weight` of at
    least 0.98.
    """
    results = DmozCategory.get_for_query(self.query)
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(results[0].relative_weight >= 0.98)
def test_score_range(self):
    """Test that every score (`relative_weight`) is a number in [0, 1]."""
    results = DmozCategory.get_for_query(self.query)
    out_of_range = [
        category for category in results
        if (category.relative_weight > 1 or category.relative_weight < 0)
    ]
    self.assertFalse(out_of_range)
def test_classify_terms_incorrect_lang(self):
    """Test that a profile is created with the default language if the given one is not supported.

    The update is requested with lang='fr' (presumably not listed in
    settings.LANGUAGES for this test setup) and the resulting profile is
    checked against the categories of the English query.
    """
    categories = DmozCategory.get_for_query(query="Einstein", lang='en')
    update_profile.apply(args=[self.profile, 'Einstein', []],
                         kwargs={'lang': 'fr', 'terms': True})
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(self._check_expected_profile(self.profile, categories))
def test_invalid_lang_query(self):
    """Test that, given an invalid language code, the method defaults to the base language.

    Querying with lang='fr' must return the same categories as querying
    without a language argument.
    """
    # list.sort() returns None, so comparing its return values would
    # always succeed; sorted() returns the ordered lists to compare.
    results = sorted(
        e.pk for e in DmozCategory.get_for_query(self.query, lang='fr'))
    expected = sorted(
        e.pk for e in DmozCategory.get_for_query(self.query))
    self.assertEqual(results, expected)
def test_weighting(self):
    """Test that the categories receive a score (a `relative_weight` attribute)."""
    results = DmozCategory.get_for_query(self.query)
    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(hasattr(results[0], 'relative_weight'))
def update_profile(profile, context, docs, lang='en', terms=True, **kwargs):
    """Update a profile with a spreading activation algorithm.

    Determines the concepts (categories) in which the user might be
    interested, propagates an attenuated activation to their ancestors and
    then evolves the stored preferences of the profile: preferences absent
    from this session decay, the ones present are reinforced, and concepts
    that were not yet in the profile are added.

    Args:
        profile: the client user
        context: the last context; index terms (a single string or an
            iterable of strings) or, when `terms` is false, a full text
        docs: the ids of the documents of our database the user found
            interesting
        lang: the language of the user; replaced by 'en' when it is not
            one of the codes in settings.LANGUAGES
        terms: whether the context is already a string of index terms or a
            full text (which is first passed through build_query)
        **kwargs: accepted for call compatibility; not used

    Returns:
        True, always.
    """
    supported_langs = [e[0] for e in settings.LANGUAGES]
    if lang not in supported_langs:
        lang = 'en'

    # STEP 0: build the concepts set and set their activation values.
    CON = {}
    if not terms and context:
        context = build_query(context, language=lang)
    # A lone string is ONE query. On Python 3 strings expose __iter__, so
    # they are tested explicitly; otherwise each character would be
    # classified as a separate query.
    if isinstance(context, (str, bytes)) or not hasattr(context, '__iter__'):
        context = [context]

    # Populate the concepts with dictionaries of the form {concept: similarity}.
    for d in context:
        CON.update(DmozCategory.get_for_query(d, lang, as_dict=True))
    doc_categories = DocumentSurrogate.objects.filter(
        pk__in=docs).values_list('category', flat=True)
    for d in doc_categories.iterator():
        # TODO: should I compute the document's summary similarity to its
        # alleged category?
        CON[d] = 1.0

    # STEP 1: spreading. Add to the interest list the attenuated weight of
    # the ancestors of every gathered concept. The keys are snapshotted
    # because ancestors are inserted into CON while walking up.
    for c in list(CON):
        curr_concept = c
        while True:
            # One query fetches both the parent and the attenuation weight
            # of the current concept.
            parent, ch_weight = DmozCategory.objects.filter(
                pk=curr_concept).values_list('parent', 'weight')[0]
            if not parent:
                break
            # Multiple children of a parent might be in CON; keep the
            # maximum score each time so the strongest one survives.
            CON[parent] = max(CON.get(parent, 0.0),
                              CON[curr_concept] * ch_weight)
            curr_concept = parent

    # STEP 2: evolve the profile, using a linear combination to update.
    existing_preferences = set()
    for preference in profile.preferences.iterator():
        ctg = preference.category.pk
        if ctg in CON:
            # Present in this session: augment.
            preference.score = DECAY*preference.score + (1-DECAY)*CON[ctg]
        else:
            # Not in this session: decay.
            preference.score = DECAY*preference.score
        preference.save()
        existing_preferences.add(ctg)

    # Add the concepts that are not in the profile yet.
    for newcat in set(CON) - existing_preferences:
        # DO NOT store zero weighted preferences.
        if CON[newcat]:
            new_pref = ClientPreference(
                category_id=newcat, score=CON[newcat], user=profile)
            new_pref.save()
    return True