def printHtmlTable(tableTag):
    """Convenience function for printing a stringified html data table

    The output is the caption (if any) on its own line, then one line per
    table row with every cell followed by a '|' character, then the
    footnotes (if any).

    Args:
        tableTag: the html of a data table, something that can be joined
            into a string and beautiful soupified

    Returns:
        The table text as a unicode string, or None if building or
        encoding the text raised a unicode error.
    """
    soup = BeautifulSoup(''.join(tableTag), 'lxml')
    try:
        parts = []

        # The caption lives in a 'table-caption' div, with 'auto-clean'
        # as the fallback class.
        captionTag = soup.find('div', {'class': 'table-caption'})
        if captionTag is None:
            captionTag = soup.find('div', {'class': 'auto-clean'})
        if captionTag is not None:
            parts.append(get_tag_text(captionTag) + u'\n')

        # A document without a <table> simply contributes no rows.
        table = soup.find('table')
        rows = table.findAll('tr') if table is not None else []
        for tr in rows:
            # Header cells are emitted before data cells within a row.
            for cell in tr.findAll('th') + tr.findAll('td'):
                parts.append(u''.join(get_tag_text(cell)) + "|")
            parts.append(u'\n')

        # Footnotes are optional, exactly like the caption.
        footnotesTag = soup.find('div', {'class': 'table-foot'})
        if footnotesTag is not None:
            parts.append(get_tag_text(footnotesTag))

        tableStr = u''.join(parts)
        # Characters outside iso-8859-15 are replaced rather than raising.
        print(tableStr.encode("iso-8859-15", "replace"))
        return tableStr
    except (UnicodeDecodeError, UnicodeEncodeError):
        print('Unicode printing failed!')
        return None
def rep_html_table_struct(html_table_tag):
    """Returns a 2D table row-column representation of an input html data table

    Args:
        html_table_tag: the html table tag of a data table object, something
            that can be beautiful soupified

    Returns:
        data_table_rep: a 2D python table representation of the table text
        num_header_rows: number of table rows containing at least one 'th'
            cell
        id_table_rep: a 2D python table representation of the table where
            each cell element is the 'id' tag of the table cell

        Grid positions never covered by a cell keep the placeholder value 0
        in both tables. (None, 0, None) is returned when no 'table' element
        is found, or when a row is already completely filled before its own
        cells are placed.

    Raises:
        KeyError: if a 'th' or 'td' cell has no 'id' attribute.

    Comments:
        function is lightly tested but should be robust to most ugly html
        tables provided by publishers
    """
    soup = BeautifulSoup(''.join(html_table_tag), 'lxml')
    html_table = soup.find('table')
    if html_table is None:
        # Nothing to represent; reuse the existing failure return shape.
        return None, 0, None

    row_tags = html_table.findAll('tr')

    # Total grid height is the sum of the row extents reported for each <tr>.
    n_rows = 0
    for tag in row_tags:
        temp_num_rows, temp_num_cols = get_row_col_width(tag)
        n_rows += temp_num_rows

    # Grid width is the widest row, counting both header (th) and data (td)
    # cells so that rows with th row labels fit in the grid.
    n_cols = 0
    for row_tag in row_tags:
        curr_col_width = 0
        for tag in row_tag.findAll('th') + row_tag.findAll('td'):
            temp_num_rows, temp_num_cols = get_row_col_width(tag)
            curr_col_width += temp_num_cols
        if curr_col_width > n_cols:
            n_cols = curr_col_width

    # Cell text grid and the parallel grid of cell html ids; 0 marks a
    # position that has not been filled yet.
    data_table_rep = [[0 for i in range(n_cols)] for j in range(n_rows)]
    id_table_rep = [[0 for i in range(n_cols)] for j in range(n_rows)]

    row_cnt = 0
    num_header_rows = 0
    for tr_html_tag in row_tags:
        header_tags = tr_html_tag.findAll('th')
        if len(header_tags) > 0:
            num_header_rows += 1

        # Start at the first free column of this row; earlier cells spanning
        # several rows may already occupy the leading columns.
        try:
            col_cnt = data_table_rep[row_cnt].index(0)
        except ValueError:
            print('Table is likely f****d up!!!')
            return None, 0, None

        # Header cells are placed before data cells, as in a row made of
        # th labels followed by td values.
        cell_tags = header_tags + tr_html_tag.findAll('td')
        try:
            for cell_tag in cell_tags:
                cell_text = get_tag_text(cell_tag)
                row_width, column_width = get_row_col_width(cell_tag)
                for i in range(row_cnt, row_cnt + row_width):
                    # Skip positions already taken by an earlier cell.
                    while data_table_rep[i][col_cnt] != 0:
                        col_cnt += 1
                    for j in range(col_cnt, col_cnt + column_width):
                        data_table_rep[i][j] = cell_text
                        id_table_rep[i][j] = cell_tag['id']
                col_cnt += column_width
        except IndexError:
            # Best effort: a cell ran past the grid, so keep what was placed
            # and drop the remainder of this row.
            pass

        # Always advance, so a malformed row cannot shift the rows after it.
        row_cnt += 1

    return data_table_rep, num_header_rows, id_table_rep