def run_process(self, html):
    """Parse ``html``, record the size of each top-level table, then feed the parser.

    Sets ``self.soup`` to the BeautifulSoup tree of ``html`` and
    ``self.tables`` to one dict per ``<table>`` that is a direct child of
    the document root, with keys ``rows``, ``cols``, ``curr_row`` and
    ``curr_col``. Finally passes ``remove_whitespace(html)`` to
    ``self.feed``.

    :param html: HTML markup to process.
    """
    self.soup = BeautifulSoup(html, 'html.parser')
    self.tables = []
    # recursive=False: only tables sitting directly under the document root.
    for table in self.soup.find_all('table', recursive=False):
        # NOTE(review): these find_all calls are recursive, so rows/cells of
        # any table nested inside this one are counted too, and <th> cells
        # are not counted -- confirm this matches what the feed handlers expect.
        rows = len(table.find_all('tr'))
        cells = len(table.find_all('td'))
        self.tables.append({
            'rows': rows,
            # Average cells per row (floor). A table with no <tr> used to
            # raise ZeroDivisionError here; report 0 columns instead.
            'cols': cells // rows if rows else 0,
            # -1 marks "no row/column visited yet".
            'curr_row': -1,
            'curr_col': -1,
        })
    self.feed(remove_whitespace(html))
def ignore_nested_tables(self, tables_soup):
    """
    Filter a sequence of table elements down to the outermost tables.

    Relies on bs4's ``find_all`` yielding every descendant of an element
    immediately after that element. Should that ordering ever change,
    this method has to be revisited.

    :param tables_soup: iterable of table elements, each exposing
        ``find_all``.
    :return: list of the tables that are not nested inside an earlier one.
    """
    candidates = list(tables_soup)
    outermost = []
    position = 0
    while position < len(candidates):
        current = candidates[position]
        outermost.append(current)
        # Jump past this table and every table nested inside it, which are
        # assumed to follow it directly in the input order.
        position += 1 + len(current.find_all('table'))
    return outermost