def process_hits(self, filter_nb):
    """Fold recent SearchHit rows into per-query SearchQuery statistics.

    Every pending hit is processed exactly once:
      - blacklisted queries are dropped silently (not even archived);
      - hits with fewer than *filter_nb* results are archived to
        SearchHitHistoric and deleted without updating statistics;
      - all other hits update (or create) the matching SearchQuery,
        recompute its weight, then are archived and deleted.

    @param filter_nb: minimum nb_results a hit must have to be counted
        in the SearchQuery statistics
    """
    def archive_hit(hit):
        # Keep an historic trace of the hit, then drop it from the
        # live SearchHit table (shared by two branches below).
        SearchHitHistoric(query=hit.query,
                          nb_results=hit.nb_results,
                          date=hit.date).save()
        hit.delete()

    for hit in SearchHit.objects.all():
        query = hit.query

        # blacklist
        if query in config.HISTORY_BLACKLIST:
            continue

        # Too few results: archive without counting
        if hit.nb_results < filter_nb:
            archive_hit(hit)
            continue

        # Manual get_or_create: we must NOT save a freshly created
        # SearchQuery before its fields are initialized below.
        try:
            search_query = SearchQuery.objects.get(query=query)
            created = False
        except SearchQuery.DoesNotExist:
            search_query = SearchQuery(query=query)
            created = True

        # if it's a new one, initialize it
        if created:
            search_query.phonex = phonex(query)
            # clean the query, the '_' char cause bugy clean_query
            query = query.replace('_', '')
            lems = lemmatize(query.split())
            clean_query = ' '.join(lem for lem in lems if lem)
            search_query.clean_query = clean_query
            search_query.clean_phonex = phonex(clean_query)
            search_query.nb_total_search = 0
            search_query.pondered_search_nb = 0
            search_query.nb_recent_search = 0

        search_query.nb_results = hit.nb_results
        search_query.nb_total_search += 1
        search_query.pondered_search_nb += 1
        search_query.nb_recent_search += 1
        # Weight mixes search frequency and result count
        # (HISTORY_BETA / HISTORY_GAMMA are the configured coefficients)
        search_query.weight = (
            search_query.pondered_search_nb * config.HISTORY_BETA +
            search_query.nb_results * config.HISTORY_GAMMA)
        search_query.save()

        # we can now create SearchHitHistoric
        archive_hit(hit)
def highlight(text, words, index=None):
    """
    Give the position of words in a text, cleaning everything as sesql does.
    That can be used to highlight the words, for example.
    The index will be used to lemmatize; if None, it'll use the default
    (primary) one.

    Returns a list of (begin, end, word_index) tuples, where word_index
    is the position of the matched word inside *words*.

    Raises ValueError if no index is given and no primary index exists.
    """
    if not text:
        return []
    if index is None:
        index = fieldmap.primary
    if index is None:
        # NOTE: was `raise ValueError, "..."` (Python-2-only syntax);
        # the call form below works on both Python 2 and 3.
        raise ValueError("No index given and no primary one")

    letters = set(string.ascii_letters)

    # Lemmatize the words
    lems = lemmatize(words, index)

    # Marshall everything
    text = index.marshall(text, use_cleanup=False)
    # BUGFIX: size must be measured on the *marshalled* text — it bounds
    # the indexing below, and marshalling may change the length.
    size = len(text)

    # Now find the lemmatized words inside the text
    found = []
    foundwords = set()
    for i, lem in enumerate(lems):
        if not lem:
            continue
        wordsize = len(lem)
        pos = 0
        while True:
            begin = text.find(lem, pos)
            if begin < 0:
                break
            end = begin + wordsize
            # We found something, ensure it's a normal word
            # (i.e. not the tail of a longer word)
            if begin and text[begin - 1] in letters:
                pos = end
                continue
            # Now find the end of the word
            while end < size and text[end] in letters:
                end += 1
            found.append((begin, end, i))
            foundwords.add(text[begin:end])
            pos = end

    # Lemmatize all found words (batched: one lemmatize call)
    foundwords = list(foundwords)
    foundlems = dict(zip(foundwords, lemmatize(foundwords, index)))

    # And now, second pass: keep a match only if the lemmatized version
    # of the full found word is the lemma we were looking for.
    results = []
    for begin, end, i in found:
        word = text[begin:end]
        if foundlems[word] == lems[i]:
            results.append((begin, end, i))
    return results