def process_lines(self, lines):
    """
    Convert the given input into a list of SoupString rows
    for further processing.

    Parameters
    ----------
    lines : iterable of str
        Lines of the HTML input. They are joined with newlines and
        handed to BeautifulSoup as a single document.

    Returns
    -------
    soup_list : list
        One ``html.SoupString`` for every ``<tr>`` tag found in the
        selected table.

    Raises
    ------
    core.OptionalTableImportError
        If ``bs4`` (BeautifulSoup) cannot be imported.
    core.InconsistentTableError
        If none of the ``<table>`` tags in the input is accepted by
        ``html.identify_table`` (this includes input with no tables).

    Notes
    -----
    As a side effect, the attributes of the selected table are stored
    in ``self.html['attrs']``.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        raise core.OptionalTableImportError(
            'BeautifulSoup must be '
            'installed to read HTML tables')

    markup = '\n'.join(lines)
    if 'parser' not in self.html:
        soup = BeautifulSoup(markup)
    else:  # use a custom backend parser
        soup = BeautifulSoup(markup, self.html['parser'])

    tables = soup.find_all('table')
    for i, possible_table in enumerate(tables):
        # identify_table() is given 1-based table positions
        if html.identify_table(possible_table, self.html, i + 1):
            table = possible_table  # Find the correct table
            break
    else:
        # 'table_id' is optional: identify_table() treats a missing
        # entry as table number 1, so mirror that default here rather
        # than masking the real error with a KeyError.
        table_id = self.html.get('table_id', 1)
        if isinstance(table_id, int):
            err_descr = 'number {0}'.format(table_id)
        else:
            err_descr = "id '{0}'".format(table_id)
        raise core.InconsistentTableError(
            'ERROR: HTML table {0} not found'.format(err_descr))

    self.html['attrs'] = table.attrs

    # Get all table rows
    soup_list = [html.SoupString(x) for x in table.find_all('tr')]

    return soup_list
def test_identify_table():
    """
    Check that identify_table() reports whether a given BeautifulSoup
    tag is the table that should be processed.
    """
    # Anything that is not a <table> tag, as well as None, is rejected
    document = BeautifulSoup('<html><body></body></html>')
    for non_table in (document, None):
        assert html.identify_table(non_table, {}, 0) is False

    table_tag = BeautifulSoup('<table id="foo"><tr><th>A</th></tr><tr>'
                              '<td>B</td></tr></table>').table

    # (htmldict, table position, expected result)
    cases = [
        # No 'table_id' given: default index of 1
        ({}, 2, False),
        ({}, 1, True),
        # Same checks, but with an explicit numeric 'table_id'
        ({'table_id': 2}, 1, False),
        ({'table_id': 1}, 1, True),
        # Identification by string ID
        ({'table_id': 'bar'}, 1, False),
        ({'table_id': 'foo'}, 1, True),
    ]
    for htmldict, numtable, expected in cases:
        assert html.identify_table(table_tag, htmldict, numtable) is expected