def _table_to_events(table, base_date, p=None):
    """Given a table html element as a BeautifulSoup, returns a list of
    event dicts built from the table's rows.

    Each event is a dict with the keys 'date', 'date_length',
    'date_string' and 'content'.

    The date column is located from the header cells: a <th> whose
    stripped, lower-cased text is 'year' marks the primary date column and
    one whose text is 'date' marks a secondary column whose parsed date is
    combined with the primary one. If only a 'date' header exists it
    becomes the primary column; if neither exists the first column is
    tried.

    A row that ends up with exactly one cell is treated as a heading: if a
    date can be parsed from it, that date is combined into base_date for
    the rows that follow and an event is emitted for the heading itself.

    Args:
        table: the table element; must support find_all('tr'). NOTE: the
            'rowspan' attribute of its cells is decremented in place while
            rowspans are propagated to later rows, so the table is
            modified by this call.
        base_date: date that every parsed row date is combined with via
            TimelineDate.combine.
        p: optional parameter dict, completed by param_defaults. Only
            p['keep_row_together'] is read here. When truthy, each row
            yields a single event whose content is all content cells
            joined by a space. Otherwise each content cell is split with
            _lines_from_html, and a cell carrying a rowspan yields its
            events once, on the last row it covers, dated as the span from
            its first row to that last row.

    Returns:
        A list of event dicts in the order they were produced.
    """
    p = param_defaults(p or {})

    def get_rowspan(td):
        """Return td's rowspan as a non-negative int, or None when the
        attribute is missing, not an integer, or negative."""
        raw = td.get('rowspan')
        if raw is None:
            return None
        try:
            span = int(raw)
        except ValueError:
            return None
        return span if span >= 0 else None

    def make_event(date, date_string, content):
        """Build one event dict from a date, its display text and content."""
        return {
            'date': date.start_year(),
            'date_length': date.length(),
            'date_string': date_string,
            'content': content
        }

    rows = table.find_all('tr')

    # Locate the date columns from the header cells. The index is the
    # position of the <th> among the <th> cells of its own row; the last
    # matching header in the table wins.
    year_col_index = None
    date_col_index = None
    for row in rows:
        for i, cell in enumerate(row.find_all('th')):
            cell_text = cell.get_text().strip().lower()
            if cell_text == 'year':
                year_col_index = i
            elif cell_text == 'date':
                date_col_index = i
    if year_col_index is None:
        if date_col_index is not None:
            # only a 'date' header: use it as the primary date column
            year_col_index = date_col_index
            date_col_index = None
        else:
            # just try using the first column. could be a bit smarter about
            # giving up early to save some cycles...
            year_col_index = 0

    events = []
    # a td that has a rowspan will be stored as (col_index, cell). The
    # rowspan number essentially gets decremented in the td element each
    # time it is added to the subsequent row
    rowspans = []
    # only used when p['keep_row_together'] is falsy: maps a rowspan cell's
    # inner html to the (date, date_string) of the first row it appeared in
    open_rowspans = {}
    for row in rows:
        cells = row.find_all('td')
        # first, apply existing rowspans (a remaining count of 0 or None
        # means the cell does not reach into this row)
        for (i, cell) in rowspans:
            if get_rowspan(cell):
                cells.insert(i, cell)
        # then, recollect existing and new rowspans
        rowspans = []
        for (i, cell) in enumerate(cells):
            rs = get_rowspan(cell)
            if rs:
                cell['rowspan'] = rs - 1
                rowspans.append((i, cell))

        # a row made of a single <th> and no <td> is handled like a
        # single-cell row below
        if not cells:
            header_cells = row.find_all('th')
            if len(header_cells) == 1:
                cells = header_cells

        if len(cells) == 1:
            # heading row: its date becomes the base for the rows below
            extract = parse_date_html(_bs_inner_html(cells[0]))
            if extract:
                base_date = TimelineDate.combine(base_date, extract[0])
                events.append(make_event(base_date, extract[1], extract[2]))
            continue

        if len(cells) <= year_col_index:
            continue
        extract = parse_date_html(_bs_inner_html(cells[year_col_index]))
        if not extract:
            continue

        date = extract[0]
        date_string = extract[1]
        if date_col_index is not None and len(cells) > date_col_index:
            extract2 = parse_date_html(_bs_inner_html(cells[date_col_index]))
            if extract2:
                date = TimelineDate.combine(date, extract2[0])
                date_string += ' ' + extract2[1]
        date = TimelineDate.combine(base_date, date)

        content_cells = [cell for (i, cell) in enumerate(cells)
                         if i != year_col_index and i != date_col_index]

        if p['keep_row_together']:
            content = ' '.join(_bs_inner_html(cell) for cell in content_cells)
            events.append(make_event(date, date_string, content))
            continue

        # deal with rowspan cells
        for cell in content_cells:
            remaining = get_rowspan(cell)
            if remaining is None:
                continue
            key = _bs_inner_html(cell)
            if key not in open_rowspans:
                # first row of the span: remember where it started
                open_rowspans[key] = (date, date_string)
            elif remaining <= 0:
                # last row of the span: emit events covering start..now
                start_date, start_string = open_rowspans[key]
                rowspan_date = TimelineDate.span_from_dates(start_date, date)
                rowspan_date_string = start_string + ' - ' + date_string
                for line in _lines_from_html(cell):
                    events.append(
                        make_event(rowspan_date, rowspan_date_string, line))

        # deal with non-rowspan cells
        for cell in content_cells:
            if get_rowspan(cell) is None:
                for line in _lines_from_html(cell):
                    events.append(make_event(date, date_string, line))

    return events