def __init__(self, element, url, mintextlen=10):
    """Set up the instance from an element and the URL it is associated with.

    Stores the arguments, keeps a cleaned copy of the element's whole
    text content in ``self.text``, and finally runs chunk extraction
    over the element.

    Arguments:
    element -- object providing ``text_content()``; looks like an lxml
               HTML element (TODO confirm against the caller)
    url -- stored unchanged in ``self.url``
    mintextlen -- stored unchanged in ``self.mintextlen``; presumably a
                  minimum text length consulted during chunk extraction
                  (not visible here -- verify)
    """
    # Plain copies of the constructor arguments.
    self.elem = element
    self.url = url
    self.mintextlen = mintextlen

    self.cleaner = SimpleHTMLCleaner()

    # The whole text of the element, passed through the cleaner.
    whole_text = element.text_content()
    self.text = self.cleaner.clean(whole_text)

    # The list must exist before extraction starts; extraction runs last
    # so that every attribute above is already in place when it is called.
    self.chunks = []
    self.__extract_chunks(element)
def __init__(self, childcoef=7.0, headercoef=4.0, mintextlen=10,
             omitted_tags=('option', 'br', 'select', 'form')):
    """Store the configuration values and create empty working state.

    Arguments:
    childcoef -- stored unchanged in ``self.childcoef``
    headercoef -- stored unchanged in ``self.headercoef``
    mintextlen -- stored unchanged in ``self.mintextlen``
    omitted_tags -- tuple of tag names stored unchanged in
                    ``self.omitted_tags``; presumably tags that are
                    skipped during processing (verify against the
                    methods that read it)

    How the coefficients and the minimum length are applied is decided
    by other methods that are not visible from this constructor.
    """
    # Configuration, kept exactly as passed in.
    self.childcoef, self.headercoef = childcoef, headercoef
    self.mintextlen = mintextlen
    self.omitted_tags = omitted_tags

    # Working state: starts out empty.
    self.sequences = {}
    self.records = []

    self.cleaner = SimpleHTMLCleaner()