class HTMLSequenceWrapperRecord(object):
    """One record of a wrapped HTML sequence: cleaned text plus text chunks.

    On construction the cleaned text of the whole element is stored and the
    element tree is walked (the element first, then its children,
    recursively) to build a list of ``TextChunk`` objects.

    NOTE(review): a second class with this same name is defined later in
    this file; at import time that later definition rebinds the name.
    """

    def __init__(self, element, url, mintextlen=10):
        """Build the record from ``element``.

        :param element: root element of the record. It must provide
            ``text_content()``, ``iterchildren()``, ``get()``, ``text``,
            ``tag`` and ``style`` (presumably an lxml-based element --
            confirm against the caller).
        :param url: stored as ``self.url``; not otherwise used by this class.
        :param mintextlen: length the cleaned text has to exceed for
            ``has_value()`` to return True.
        """
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url

        # Cleaned text of the whole record.
        self.text = self.cleaner.clean(self.elem.text_content())

        # Chunks are collected in the order the elements are visited.
        self.chunks = []
        self.__extract_chunks(self.elem)

    def has_value(self):
        """Return True if the record text is meaningful.

        The cleaned text must be accepted by the cleaner's
        ``contains_text()`` and be strictly longer than ``mintextlen``.
        """
        # Truthiness test (instead of ``== False``) so that any falsy
        # result rejects the record, consistent with _handle_elem().
        if not self.cleaner.contains_text(self.text):
            return False
        return len(self.text) > self.mintextlen

    def get_chunks(self):
        """Return the list of ``TextChunk`` objects built in ``__init__``."""
        return self.chunks

    def get_text(self):
        """Return the cleaned text of the whole record."""
        return self.text

    def _handle_elem(self, elem):
        """Create a ``TextChunk`` for ``elem``.

        :param elem: element to convert.
        :returns: the new chunk, or None when ``elem.text`` is None or the
            cleaner reports that the element contains no text.
        """
        if elem.text is None:
            return None
        # NOTE(review): the element itself (not elem.text) is passed here,
        # whereas has_value() passes a string -- confirm that
        # contains_text() accepts both.
        if not self.cleaner.contains_text(elem):
            return None

        chunk = TextChunk()

        # Link target, for any element carrying an 'href' attribute.
        href = elem.get('href')
        if href is not None:
            chunk.set_link(href)

        # The 'title' attribute is used as a comment, but only for anchors.
        if elem.tag == 'a':
            title = elem.get('title')
            if title is not None:
                chunk.set_comment(title)

        # Text of the chunk, cleaned the same way as the record text.
        chunk.set_text(self.cleaner.clean(elem.text_content()))

        # Style and tag are copied from the element unchanged.
        chunk.set_style(elem.style)
        chunk.set_tag(elem.tag)
        return chunk

    def __extract_chunks(self, elem):
        """Append chunks for ``elem`` and all its descendants, parent first."""
        thischunk = self._handle_elem(elem)
        # Identity check: does not depend on how TextChunk compares to None.
        if thischunk is not None:
            self.chunks.append(thischunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)

    def __str__(self):
        """Return a short description containing the record text."""
        return "<" + __modulename__ + ".HTMLSequenceWrapperRecord instance " + self.text + " >"
class HTMLSequenceWrapperRecord(object):
    """One record of a wrapped HTML sequence: cleaned text plus text chunks.

    On construction the cleaned text of the whole element is stored and the
    element tree is walked (the element first, then its children,
    recursively) to build a list of ``TextChunk`` objects.

    NOTE(review): a class with this same name is also defined earlier in
    this file; being the later one, this definition is the one the name
    ends up bound to.
    """

    def __init__(self, element, url, mintextlen=10):
        """Build the record from ``element``.

        :param element: root element of the record. It must provide
            ``text_content()``, ``iterchildren()``, ``get()``, ``text``,
            ``tag`` and ``style`` (presumably an lxml-based element --
            confirm against the caller).
        :param url: stored as ``self.url``; not otherwise used by this class.
        :param mintextlen: length the cleaned text has to exceed for
            ``has_value()`` to return True.
        """
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url

        # Cleaned text of the whole record.
        self.text = self.cleaner.clean(self.elem.text_content())

        # Chunks are collected in the order the elements are visited.
        self.chunks = []
        self.__extract_chunks(self.elem)

    def has_value(self):
        """Return True if the record text is meaningful.

        The cleaned text must be accepted by the cleaner's
        ``contains_text()`` and be strictly longer than ``mintextlen``.
        """
        # Truthiness test (instead of ``== False``) so that any falsy
        # result rejects the record, consistent with _handle_elem().
        if not self.cleaner.contains_text(self.text):
            return False
        return len(self.text) > self.mintextlen

    def get_chunks(self):
        """Return the list of ``TextChunk`` objects built in ``__init__``."""
        return self.chunks

    def get_text(self):
        """Return the cleaned text of the whole record."""
        return self.text

    def _handle_elem(self, elem):
        """Create a ``TextChunk`` for ``elem``.

        :param elem: element to convert.
        :returns: the new chunk, or None when ``elem.text`` is None or the
            cleaner reports that the element contains no text.
        """
        if elem.text is None:
            return None
        # NOTE(review): the element itself (not elem.text) is passed here,
        # whereas has_value() passes a string -- confirm that
        # contains_text() accepts both.
        if not self.cleaner.contains_text(elem):
            return None

        chunk = TextChunk()

        # Link target, for any element carrying an 'href' attribute.
        href = elem.get('href')
        if href is not None:
            chunk.set_link(href)

        # The 'title' attribute is used as a comment, but only for anchors.
        if elem.tag == 'a':
            title = elem.get('title')
            if title is not None:
                chunk.set_comment(title)

        # Text of the chunk, cleaned the same way as the record text.
        chunk.set_text(self.cleaner.clean(elem.text_content()))

        # Style and tag are copied from the element unchanged.
        chunk.set_style(elem.style)
        chunk.set_tag(elem.tag)
        return chunk

    def __extract_chunks(self, elem):
        """Append chunks for ``elem`` and all its descendants, parent first."""
        thischunk = self._handle_elem(elem)
        # Identity check: does not depend on how TextChunk compares to None.
        if thischunk is not None:
            self.chunks.append(thischunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)

    def __str__(self):
        """Return a short description containing the record text."""
        return "<" + __modulename__ + ".HTMLSequenceWrapperRecord instance " + self.text + " >"