예제 #1
0
def string_blocks_to_events(string_blocks, p = None):
	"""Convert string blocks into a flat list of timeline events.

	Args:
		string_blocks: a re-iterable collection (it is traversed twice) of
			dicts as produced by html_to_string_blocks. Each block is read
			through block['heading'] (a sequence of heading strings; entry 0
			decides whether the section is processed, and every entry is
			tried as a date source) and block['lines'] (a sequence of dicts
			with 'line_type' and 'line' keys). All strings are expected to
			be non-empty.
		p: optional parameter dict, completed by param_defaults. Keys read
			here: 'single_section', 'extra_ignore_sections' (section names
			separated by '&') and 'continuations'. The completed dict is
			also forwarded to _table_to_events.

	Returns:
		The list of events. Events built here are dicts of the form
		{'date': number, 'date_length': ..., 'date_string': string,
		'content': string} and are handed to _close_event together with the
		list; events coming from tables are whatever _table_to_events
		returns.
	"""

	curr_ignore_sections = _ignore_sections.copy()
	p = param_defaults(p or {})

	def section_test(name):
		"""Return True if the section called `name` should be processed."""
		name = name.strip().lower()
		if p['single_section']:
			return name == p['single_section'].strip().lower()
		return name not in curr_ignore_sections

	if not any(section_test(sb['heading'][0]) for sb in string_blocks):
		# allow the first section to be processed if it is the only section,
		# excluding excluded sections like see also, etc. Usually this section
		# is just an intro paragraph, but if no section passes the test, it is
		# probably the entire content of the article
		curr_ignore_sections.discard('')
	if p['extra_ignore_sections']:
		curr_ignore_sections.update(
			s.lower().strip() for s in p['extra_ignore_sections'].split('&'))

	curr_event = None
	events = []

	for string_block in string_blocks:
		if not section_test(string_block['heading'][0]):
			continue

		prev_date = None

		# create base date based on headings:
		# possible perf improvement by caching results for headings across string_blocks
		base_date = TimelineDate(TimePoint())
		base_date_string = ''
		for h in string_block['heading']:
			parse = parse_date_html(h)
			if parse:
				base_date = TimelineDate.combine(base_date, parse[0])
				base_date_string = parse[1]

		# if there's a year specified in the headings, we create a fuzzy
		# range that child elements of those headings need to fall in
		base_date_range = None
		base_year = base_date.start_year()
		if base_year is not None:
			delta_minus = 10
			delta_plus = 20
			# a year ending in k zeros (1900, 2000, ...) is treated as a
			# round period, so the window widens to 10**k / 2 * 10**k.
			# NOTE: plain raw string on purpose -- the `ur` prefix is a
			# syntax error on Python 3.
			m = re.search(r'0+$', str(base_date.start.year))
			if m:
				delta_minus = 10 ** (m.end() - m.start())
				delta_plus = delta_minus * 2
			base_date_range = (base_year - delta_minus, base_year + delta_plus)

		for line in string_block['lines']:
			if line['line_type'] == LineTypes.line:
				parse = parse_date_html(line['line'])
				# if we can parse a date that is plausible under the
				# headings, create a new event
				if parse and (
						base_date_range is None
						or parse[0].start_year() is None
						or base_date_string.lower().strip() == 'antiquity'
						or base_date_range[0] <= parse[0].start_year() <= base_date_range[1]
						or TimelineDate.can_combine_as_day(base_date, parse[0])):

					_close_event(events, curr_event)
					date = parse[0]
					if date.start_year() is None and prev_date:
						# this is the case where we have a month or
						# monthday but no year. in this case, take it from
						# the previous event
						date = TimelineDate.combine(prev_date, date)
					date = TimelineDate.combine(base_date, date)
					curr_event = {
						'date': date.start_year(),
						'date_length': date.length(),
						'date_string': parse[1],
						'content': parse[2]
					}
					prev_date = date
				# if we can't parse a date, append the line to the
				# current event if there is one
				elif curr_event:
					if p['continuations']:
						curr_event['content'] += _line_break + line['line']
					else:
						# start a sibling event that reuses the date of the
						# event just closed
						_close_event(events, curr_event)
						curr_event = {
							'date': curr_event['date'],
							'date_length': curr_event['date_length'],
							'date_string': curr_event['date_string'],
							'content': line['line']
						}
				# if there's no parse and no current event, see if we can
				# use the base_date
				elif base_date.start_year() is not None:
					# no need to close events because curr_event is None
					curr_event = {
						'date': base_date.start_year(),
						'date_length': base_date.length(),
						'date_string': base_date_string,
						'content': line['line']
					}
			elif line['line_type'] == LineTypes.table:
				_close_event(events, curr_event)
				events += _table_to_events(line['line'], base_date, p)
				curr_event = None

		# an event never spans two sections
		_close_event(events, curr_event)
		curr_event = None

	return events
예제 #2
0
def _table_to_events(table, base_date, p = None):
	"""Given a table html element as a BeautifulSoup, returns a list of
	timeline events extracted from its rows.

	Args:
		table: the table element. NOTE: the 'rowspan' attribute of its cells
			is overwritten (counted down) while the rows are walked, so the
			element is modified by this call.
		base_date: TimelineDate every parsed row date is combined with. A row
			consisting of a single parseable cell replaces it for the rows
			that follow (within this call only).
		p: optional parameter dict, completed by param_defaults. Key read
			here: 'keep_row_together' -- when true, each row yields one event
			whose content is the joined inner HTML of the non-date cells;
			otherwise every line of every content cell becomes its own event
			and cells spanning several rows get a date range.

	Returns:
		A list of dicts {'date', 'date_length', 'date_string', 'content'}.
	"""
	p = param_defaults(p or {})

	def get_rowspan(td):
		"""Return td's rowspan as an int >= 0, or None if absent or invalid."""
		s = td.get('rowspan')
		if s is None:
			return None
		try:
			i = int(s)
		except (TypeError, ValueError):
			return None
		return i if i >= 0 else None

	def make_event(event_date, event_date_string, content):
		"""Build one event dict from a TimelineDate, its text and content."""
		return {
			'date': event_date.start_year(),
			'date_length': event_date.length(),
			'date_string': event_date_string,
			'content': content
		}

	events = []

	# locate the date columns from the header cells; the last matching
	# header wins
	year_col_index = None
	date_col_index = None
	for row in table.find_all('tr'):
		for i, cell in enumerate(row.find_all('th')):
			cell_text = cell.get_text().strip().lower()
			if cell_text == 'year':
				year_col_index = i
			elif cell_text == 'date':
				date_col_index = i
	if year_col_index is None:
		if date_col_index is not None:
			# only a 'date' header: treat it as the primary date column
			year_col_index = date_col_index
			date_col_index = None
		else:
			# just try using the first column. could be a bit smarter about
			# giving up early to save some cycles...
			year_col_index = 0

	# a td that has a rowspan will be stored as (col_index, cell) The
	# rowspan number essentially gets decremented in the td element each
	# time it is added to the subsequent row
	rowspans = []
	# only used when p['keep_row_together'] is false: maps id(cell) of a
	# content cell whose span is still running to the (date, date_string) of
	# the row it started on. Keyed by identity rather than by markup so two
	# cells with the same content never share a start date.
	open_rowspans = {}

	for row in table.find_all('tr'):
		cells = row.find_all('td')

		# first, apply existing rowspans (truthy == at least one row left)
		for (i, cell) in rowspans:
			if get_rowspan(cell):
				cells.insert(i, cell)
		# then, recollect existing and new rowspans
		rowspans = []
		for (i, cell) in enumerate(cells):
			rs = get_rowspan(cell)
			if rs:
				cell['rowspan'] = rs - 1
				rowspans.append((i, cell))

		# a row made of a single header cell is treated like a single td
		if not cells:
			header_cells = row.find_all('th')
			if len(header_cells) == 1:
				cells = header_cells

		if len(cells) == 1:
			extract = parse_date_html(_bs_inner_html(cells[0]))
			if extract:
				base_date = TimelineDate.combine(base_date, extract[0])
				events.append(make_event(base_date, extract[1], extract[2]))
		elif len(cells) > year_col_index:
			extract = parse_date_html(_bs_inner_html(cells[year_col_index]))
			if not extract:
				continue

			date = extract[0]
			date_string = extract[1]
			if date_col_index is not None and len(cells) > date_col_index:
				extract2 = parse_date_html(_bs_inner_html(cells[date_col_index]))
				if extract2:
					date = TimelineDate.combine(date, extract2[0])
					date_string += ' ' + extract2[1]
			date = TimelineDate.combine(base_date, date)
			content_cells = [cell for (i, cell) in enumerate(cells)
				if i != year_col_index and i != date_col_index]

			if p['keep_row_together']:
				content = ' '.join(_bs_inner_html(cell) for cell in content_cells)
				events.append(make_event(date, date_string, content))
				continue

			# deal with rowspan cells first; everything that turns out to
			# belong to this row only is collected and emitted afterwards
			single_row_cells = []
			for cell in content_cells:
				remaining = get_rowspan(cell)
				if remaining is None:
					single_row_cells.append(cell)
				elif id(cell) in open_rowspans:
					if remaining <= 0:
						# last row of the span: emit with the full range and
						# forget the span so it cannot leak into later cells
						(start_date, start_string) = open_rowspans.pop(id(cell))
						rowspan_date = TimelineDate.span_from_dates(start_date, date)
						rowspan_date_string = start_string + ' - ' + date_string
						for line in _lines_from_html(cell):
							events.append(make_event(rowspan_date, rowspan_date_string, line))
				elif remaining > 0:
					# first dated row of a span that continues below
					open_rowspans[id(cell)] = (date, date_string)
				else:
					# the span starts and ends on this row (rowspan="1" or
					# "0", or no earlier row had a date): an ordinary cell.
					# Previously such cells were registered as open and
					# their content was never emitted.
					single_row_cells.append(cell)

			# deal with non-rowspan cells
			for cell in single_row_cells:
				for line in _lines_from_html(cell):
					events.append(make_event(date, date_string, line))

	return events
예제 #3
0
	def test_one(self):
		"""Print the result of three date combinations for manual inspection.

		Nothing is asserted; the combined values are only written to stdout.
		"""
		# month/day given without a year, combined onto a year+month point
		month_day_onto_year = TimePoint.combine(
			TimePoint(1920, 4), TimePoint(None, 5, 3))
		print(month_day_onto_year)

		# bare month combined onto a bare year
		month_onto_year = TimePoint.combine(TimePoint(1920), TimePoint(None, 4))
		print(month_onto_year)

		# same idea one level up, on TimelineDate wrappers
		# NOTE(review): TimePoint(4) passes 4 as the first positional
		# argument, unlike the TimePoint(None, 4) used above -- confirm
		# this is intended
		combined_dates = TimelineDate.combine(
			TimelineDate(TimePoint(1920, 8)), TimelineDate(TimePoint(4)))
		print(combined_dates)