예제 #1
0
def _table_to_events(table, base_date, p = None):
	"""Extract timeline events from an HTML table.

	Header cells (th) whose text is 'year' or 'date' pick the columns that
	hold the dates. If only a 'date' header is found it is used as the
	primary date column; if neither is found the first column is used.

	Every row is then handled in one of these ways:

	* A row with exactly one cell that parses as a date updates base_date
	  for the rows that follow and yields one event.
	* Any other row whose primary date cell parses yields events dated by
	  that cell (refined by the 'date' column when there is one) combined
	  with base_date. With p['keep_row_together'] the remaining cells are
	  joined into a single event; otherwise each line of each remaining
	  cell becomes its own event, and a cell spanning several rows yields
	  its events once, on its last row, dated from the first row it was
	  seen on to that last row.
	* Rows without a parseable date yield nothing.

	Args:
		table: the table element as a BeautifulSoup tag. It is only read;
			rowspan bookkeeping is kept locally.
		base_date: date that every row date is combined with through
			TimelineDate.combine.
		p: optional parameter dict, completed by param_defaults. Only
			'keep_row_together' is read here.

	Returns:
		A list of dicts with the keys 'date', 'date_length', 'date_string'
		and 'content', in the order the events were found.
	"""
	p = param_defaults(p or {})

	def get_rowspan(td):
		"""Return the cell's rowspan attribute as an int >= 0, else None."""
		s = td.get('rowspan')
		if s is None:
			return None
		try:
			i = int(s)
		except (TypeError, ValueError):
			return None
		return i if i >= 0 else None

	events = []

	def add_event(when, when_string, content):
		"""Append one event dict built from a date, its text and content."""
		events.append({
			'date': when.start_year(),
			'date_length': when.length(),
			'date_string': when_string,
			'content': content
		})

	# locate the date columns from the header cells
	year_col_index = None
	date_col_index = None
	for row in table.find_all('tr'):
		for i, cell in enumerate(row.find_all('th')):
			cell_text = cell.get_text().strip().lower()
			if cell_text == 'year':
				year_col_index = i
			elif cell_text == 'date':
				date_col_index = i
	if year_col_index is None:
		# a lone 'date' column becomes the primary column; with no usable
		# header at all just try the first column
		year_col_index = 0 if date_col_index is None else date_col_index
		date_col_index = None

	# (col_index, cell) pairs for cells that still span into the next row
	carried = []
	# id(cell) -> number of rows the cell still spans after the current one.
	# Kept here instead of rewriting the rowspan attribute so that the
	# caller's tree is left untouched.
	rows_left = {}
	# id(cell) -> (date, date_string) of the first dated row of a cell that
	# spans several rows; only used when p['keep_row_together'] is false
	span_starts = {}

	for row in table.find_all('tr'):
		cells = row.find_all('td')

		# first, put the cells spanning down from earlier rows back in place
		for (i, cell) in carried:
			cells.insert(i, cell)

		# then, recollect existing and new rowspans, counting this row off
		carried = []
		for (i, cell) in enumerate(cells):
			key = id(cell)
			if key in rows_left:
				left = rows_left[key]
			else:
				left = get_rowspan(cell)
				if left is None:
					continue
			if left:
				left -= 1
			rows_left[key] = left
			if left:
				carried.append((i, cell))

		if not cells:
			header_cells = row.find_all('th')
			if len(header_cells) == 1:
				cells = header_cells

		if len(cells) == 1:
			# a lone date cell re-bases the rows that follow
			extract = parse_date_html(_bs_inner_html(cells[0]))
			if extract:
				base_date = TimelineDate.combine(base_date, extract[0])
				add_event(base_date, extract[1], extract[2])
			continue

		if len(cells) <= year_col_index:
			continue
		extract = parse_date_html(_bs_inner_html(cells[year_col_index]))
		if not extract:
			continue

		date = extract[0]
		date_string = extract[1]
		if date_col_index is not None and len(cells) > date_col_index:
			extract2 = parse_date_html(_bs_inner_html(cells[date_col_index]))
			if extract2:
				date = TimelineDate.combine(date, extract2[0])
				date_string += ' ' + extract2[1]
		date = TimelineDate.combine(base_date, date)
		content_cells = [cell for (i, cell) in enumerate(cells)
			if i != year_col_index and i != date_col_index]

		if p['keep_row_together']:
			content = ' '.join(_bs_inner_html(cell) for cell in content_cells)
			add_event(date, date_string, content)
			continue

		# Split mode. Spans that end on this row are emitted first, then the
		# cells that belong to this row only.
		single_row_cells = []
		for cell in content_cells:
			key = id(cell)
			left = rows_left.get(key)
			if left is None:
				# no usable rowspan attribute
				single_row_cells.append(cell)
			elif key in span_starts:
				if left <= 0:
					# last row of the span: date it from start to here and
					# forget it so the entry cannot leak into later cells
					start_date, start_string = span_starts.pop(key)
					span_date = TimelineDate.span_from_dates(start_date, date)
					span_string = start_string + ' - ' + date_string
					for line in _lines_from_html(cell):
						add_event(span_date, span_string, line)
			elif left > 0:
				# first dated row of a cell that continues below
				span_starts[key] = (date, date_string)
			else:
				# rowspan of 0 or 1, or a span first dated on its final
				# row: nothing to span over, so treat as an ordinary cell
				single_row_cells.append(cell)

		for cell in single_row_cells:
			for line in _lines_from_html(cell):
				add_event(date, date_string, line)

	return events