Exemplo n.º 1
0
class TableParser(HTMLParser):
    """Stream the rows of an HTML table to caller-supplied callbacks.

    Only the tags listed in ``VALID_TAGS`` are forwarded to the internal
    ``StateMachine``; every other tag is ignored.  Each time a ``</tr>`` end
    tag is seen, the text collected for that row is handed to one of the
    callbacks as a list of strings and a fresh list is started.

    The list holds exactly one string per cell: all text fragments that
    arrive while a cell is open (for example around inline markup such as
    ``<td>foo <b>bar</b></td>``) are concatenated, and a cell without any
    text contributes ``''``.  This keeps list positions aligned with table
    columns.

    Callback selection on ``</tr>``:

    * ``header_row_fn`` when the state machine reports ``in_header()``;
    * ``footer_row_fn`` when it reports ``in_footer()``;
    * ``body_row_fn`` otherwise.

    A header or footer row whose own callback was not supplied falls through
    to ``body_row_fn`` (when given).  That behaviour is kept as-is so that
    existing callers are unaffected.

    NOTE(review): ``StateMachine`` is defined outside this class.  The cell
    handling assumes ``in_cell()`` is a side-effect-free query that is true
    once a ``<th>``/``<td>`` start tag has been accepted -- confirm against
    its implementation.
    """

    VALID_TAGS = ['table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td']

    # Tags that open or close a single table cell.
    _CELL_TAGS = ('th', 'td')

    def __init__(self, header_row_fn=None, body_row_fn=None, footer_row_fn=None):
        """Create a parser.

        Each callback is optional and is called with a single argument: the
        list of cell strings of a completed row.
        """
        HTMLParser.__init__(self)
        self.state_machine = StateMachine()
        self.header_row_fn = header_row_fn
        self.body_row_fn = body_row_fn
        self.footer_row_fn = footer_row_fn
        # Finished cells of the row currently being read.
        self.cells = []
        # Text fragments of the cell currently being read; None means that
        # no cell is open.
        self._cell_parts = None

    def _flush_cell(self):
        """Append the open cell (if any) to ``self.cells`` as one string."""
        if self._cell_parts is not None:
            self.cells.append(''.join(self._cell_parts))
            self._cell_parts = None

    def handle_starttag(self, tag, attrs):
        """Forward a table-related start tag to the state machine.

        A ``<th>``/``<td>`` additionally closes any cell that is still open
        (its end tag may have been omitted) and opens a new one.
        """
        if tag not in TableParser.VALID_TAGS:
            return
        self.state_machine.transition(tag, StateMachine.START_TAG)
        if tag in TableParser._CELL_TAGS:
            self._flush_cell()
            # Only open a cell the state machine agrees we are in, so a stray
            # cell tag it did not accept cannot create a phantom empty cell.
            if self.state_machine.in_cell():
                self._cell_parts = []

    def handle_endtag(self, tag):
        """Forward a table-related end tag and emit the row on ``</tr>``."""
        if tag not in TableParser.VALID_TAGS:
            return
        self.state_machine.transition(tag, StateMachine.END_TAG)
        if tag in TableParser._CELL_TAGS:
            self._flush_cell()
        elif tag == 'tr':
            # The last cell's end tag may have been omitted; do not lose it.
            self._flush_cell()
            if self.state_machine.in_header() and self.header_row_fn:
                self.header_row_fn(self.cells)
            elif self.state_machine.in_footer() and self.footer_row_fn:
                self.footer_row_fn(self.cells)
            elif self.body_row_fn:
                self.body_row_fn(self.cells)
            # Start a new list rather than clearing in place: the callback
            # may have kept a reference to the one it was given.
            self.cells = []

    def handle_data(self, data):
        """Collect text that the state machine reports as being in a cell."""
        if not self.state_machine.in_cell():
            return
        if self._cell_parts is None:
            # The state machine says we are in a cell although no cell start
            # tag opened a buffer here; keep the text rather than drop it.
            self._cell_parts = []
        self._cell_parts.append(data)