def save(self, *args, **kwargs):
    """Refresh the derived content fields, then delegate to the parent save.

    Sets ``content_length``, ``content_word_count``, ``content_ascii`` and
    ``subject_title_ascii`` from ``content`` and ``subject_title`` before
    calling the parent class's ``save`` with the same arguments.
    """
    self.content_length = len(self.content)
    # Blank out the ASCII ranges '!'..'@' and '['..'`' (punctuation and
    # digits) so they are not counted as words.
    content = re.sub('[!-@[-`]', ' ', self.content)
    # Collapse space runs in the *stripped* text; re-reading self.content
    # here would throw away the substitution above.
    content = re.sub(' +', ' ', content)
    self.content_word_count = wordcount(content)
    self.content_ascii = unicode_to_ascii(self.content)
    self.subject_title_ascii = unicode_to_ascii(self.subject_title)
    super(ScrappedDocument, self).save(*args, **kwargs)
def update_word_stats(sender, instance, created, using, **kwargs):
    """Add the word counts of a newly created instance to the Word stats.

    The signature matches a model-signal receiver (presumably wired to
    ``post_save`` -- confirm at the connection site). Nothing happens
    unless ``created`` is true, so later saves are not counted again.
    """
    if not created:
        return
    # Join with a space: an empty separator would fuse the last word of the
    # content with the first word of the title into one bogus word.
    text = ' '.join((instance.content_ascii, instance.subject_title_ascii))
    text = unicode_to_ascii(text).lower()
    for word, count in count_words(text):
        Word.objects.increase_count(word, count)
def cognate_words(self, words):
    """Return the entries whose ``base`` matches the base of any given word.

    ``words`` may be a collection of words or a single string. A unicode
    string is first converted with ``unicode_to_ascii``; a string is then
    split into words on runs of non-word characters.
    """
    if isinstance(words, unicode):
        words = str(unicode_to_ascii(words))
    if isinstance(words, str):
        # Split on the separators (non-word runs), not on the words
        # themselves, and drop the empty pieces that leading or trailing
        # separators produce.
        words = [word for word in re.split(r'\W+', words) if word]
    base_words = self.filter(word__in=words).values_list('base', flat=True)
    # Field lookups need a double underscore: ``base__in``.
    return self.filter(base__in=base_words)
def _normalize(self, query):
    """Return ``query`` converted to ASCII with separators collapsed.

    The query goes through ``unicode_to_ascii``; every run of non-word
    characters in the result is then replaced by a single space.
    """
    ascii_query = unicode_to_ascii(query)
    return re.sub('\W+', ' ', ascii_query)