def process_lines(self, lines):
        """
        Convert the given input into a list of SoupString rows
        for further processing.

        Parameters
        ----------
        lines : iterable of str
            Lines of the HTML document; they are joined with newlines
            before being handed to BeautifulSoup.

        Returns
        -------
        list
            One ``html.SoupString`` per ``<tr>`` element found in the
            selected table.

        Raises
        ------
        core.OptionalTableImportError
            If BeautifulSoup (``bs4``) cannot be imported.
        core.InconsistentTableError
            If no ``<table>`` in the input is accepted by
            ``html.identify_table``.

        Notes
        -----
        As a side effect, the attributes of the selected table are stored
        in ``self.html['attrs']``.
        """

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise core.OptionalTableImportError(
                'BeautifulSoup must be '
                'installed to read HTML tables')

        document = '\n'.join(lines)
        if 'parser' not in self.html:
            soup = BeautifulSoup(document)
        else:  # use a custom backend parser
            soup = BeautifulSoup(document, self.html['parser'])

        # Tables are numbered from 1 when matched against the settings
        for i, possible_table in enumerate(soup.find_all('table')):
            if html.identify_table(possible_table, self.html, i + 1):
                table = possible_table  # Find the correct table
                break
        else:
            # 'table_id' may be absent from the settings; identify_table
            # then selects table number 1, so report that same default
            # rather than failing with a KeyError while building the message.
            table_id = self.html.get('table_id', 1)
            if isinstance(table_id, int):
                err_descr = 'number {0}'.format(table_id)
            else:
                err_descr = "id '{0}'".format(table_id)
            raise core.InconsistentTableError(
                'ERROR: HTML table {0} not found'.format(err_descr))

        self.html['attrs'] = table.attrs
        # Get all table rows
        soup_list = [html.SoupString(x) for x in table.find_all('tr')]

        return soup_list
示例#2
0
def test_identify_table():
    """
    Check that identify_table() reports whether a given BeautifulSoup
    tag is the table that should be processed.
    """

    # Anything that is not a <table> tag (including None) is rejected
    document = BeautifulSoup('<html><body></body></html>')
    for not_a_table in (document, None):
        assert html.identify_table(not_a_table, {}, 0) is False

    markup = ('<table id="foo"><tr><th>A</th></tr><tr>'
              '<td>B</td></tr></table>')
    table_tag = BeautifulSoup(markup).table

    # Without a 'table_id' setting, only table number 1 is accepted
    assert html.identify_table(table_tag, {}, 2) is False
    assert html.identify_table(table_tag, {}, 1) is True

    # Numeric 'table_id' given explicitly
    by_second_index = {'table_id': 2}
    by_first_index = {'table_id': 1}
    assert html.identify_table(table_tag, by_second_index, 1) is False
    assert html.identify_table(table_tag, by_first_index, 1) is True

    # String 'table_id' is compared with the tag's id attribute
    wrong_id = {'table_id': 'bar'}
    right_id = {'table_id': 'foo'}
    assert html.identify_table(table_tag, wrong_id, 1) is False
    assert html.identify_table(table_tag, right_id, 1) is True
示例#3
0
def test_identify_table():
    """
    Verify that identify_table() tells whether the supplied
    BeautifulSoup tag is the table selected for processing.
    """

    # A whole document and None are both non-<table> inputs
    empty_page = BeautifulSoup('<html><body></body></html>')
    assert html.identify_table(empty_page, {}, 0) is False
    assert html.identify_table(None, {}, 0) is False

    tag = BeautifulSoup('<table id="foo"><tr><th>A</th></tr><tr>'
                        '<td>B</td></tr></table>').table

    # Each case: (settings dict, table number, expected result)
    cases = [
        ({}, 2, False),
        ({}, 1, True),  # Default index of 1
        # Same checks with the index spelled out
        ({'table_id': 2}, 1, False),
        ({'table_id': 1}, 1, True),
        # Identification by string ID
        ({'table_id': 'bar'}, 1, False),
        ({'table_id': 'foo'}, 1, True),
    ]
    for settings, numtable, expected in cases:
        assert html.identify_table(tag, settings, numtable) is expected