def clean_steam():
    """Normalize the ``steamed_content`` of every stored document.

    Walks every ``Document`` and, for each one whose ``steamed_content`` is
    non-empty, runs the text through ``strip_tags``, ``remove_non_unicode``,
    ``remove_non_alphanumeric`` and ``remove_spaces`` (in that order), then
    writes the result back to ``steamed_content`` and saves the document.

    ``avance`` is called once per document, before any cleaning, with the
    running ``current``/``goal`` counters and the total number of documents;
    its return value becomes the new ``(goal, current)`` pair.
    """
    all_documents = Document.objects.all()
    total = len(all_documents)
    goal = current = 0

    for doc in all_documents:
        # Progress bookkeeping runs for every document, cleaned or not.
        goal, current = avance(current, total, goal)

        if not doc.steamed_content:
            continue

        # Remove <>, [], <!-- --> and line breaks.
        text = strip_tags(unicode(doc.steamed_content))
        # Trim the edges, lowercase and drop accents; the result is padded
        # with one space on each side before the next helpers run.
        text = ' ' + remove_non_unicode(text.strip().lower()) + ' '
        # Remove numbers and characters, then remove spaces.
        doc.steamed_content = remove_spaces(remove_non_alphanumeric(text))
        doc.save()
def clean_content(self, stopwords=None, s_t=True):
    """Fill ``self.cleaned_content`` from ``self.original_content``.

    Does nothing when ``cleaned_content`` is already set. Otherwise the
    original text is passed through ``strip_tags`` (optional),
    ``remove_non_unicode``, ``remove_non_alphanumeric``, ``remove_spaces``
    and ``remove_words``, and the stripped result is stored on
    ``self.cleaned_content``. The instance is not saved here.

    Args:
        stopwords: Pattern handed to ``remove_words``. When ``None`` it is
            built from every ``Stopword`` row: each one is converted with
            ``str``, padded with a space on both sides, and the pieces are
            joined with ``"|"``. If the resulting value is empty, stop-word
            removal is skipped and a notice is printed.
        s_t: When true, run ``strip_tags`` on the text first.

    Returns:
        None. The result is written to ``self.cleaned_content``.
    """
    if self.cleaned_content:
        # Already cleaned: leave the stored value untouched.
        return

    aux = unicode(self.original_content)

    # Remove <>, [], <!-- --> and line breaks.
    if s_t:
        aux = strip_tags(aux)

    # Trim the edges, lowercase and drop accents. The text is padded with a
    # space on each side, presumably so that space-delimited stop-word
    # patterns can also match the first and last word.
    aux = " " + remove_non_unicode(aux.strip().lower()) + " "
    # Remove numbers and characters.
    aux = remove_non_alphanumeric(aux)
    # Remove spaces.
    aux = remove_spaces(aux)

    # Remove stop words.
    if stopwords is None:
        stopwords = "|".join(
            " " + str(word) + " " for word in Stopword.objects.all()
        )

    if stopwords:
        aux = remove_words(aux, stopwords)
    else:
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the print function.
        print("Document %s: There aren't any stop words!" % self.id)

    # Collapse double spaces (presumably left where stop words were removed).
    # Replacing a single space with a single space would have no effect.
    aux = aux.replace("  ", " ")
    self.cleaned_content = aux.strip()