示例#1
0
	def test_misc(self):
		"""Run parse_date_html over blank, junk and simple dated inputs."""
		# inputs for which the expected parse_date_html result is None
		no_date_inputs = (u'', u'  ', u'<p></p>', u'<p>  </p>', u'lkjdr3f')
		cases = tuple((html, None) for html in no_date_inputs)

		# inputs paired with the full tuple parse_date_html is expected to return
		cases += (
			(
				u'1967 - 12th street massacre',
				(TimelineDate(TimePoint(1967, year_approx=False)), u'1967', u'12th street massacre'),
			),
			(
				u'1245\xa0AD',
				(TimelineDate(TimePoint(1245)), u'1245\xa0AD', u''),
			),
		)
		self.helper(cases)

		# for these two inputs only the start year of the parsed date is checked
		year_range = parse_date_html(u'900–929')
		self.assertEqual(year_range[0].start_year(), 900)
		century = parse_date_html(u'1st century')
		self.assertEqual(century[0].start_year(), 0)
示例#2
0
	def test_html_parsing(self):
		"""Check how parse_date_html splits a leading year out of nested markup.

		Every case lists the input html, the expected date string and the
		expected remaining html; all of them are expected to parse to 1890.
		"""
		cases = (
			# no wrapping element
			(
				u'1890 <a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a>',
				u'1890',
				u'<a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a>',
			),
			# one wrapping element, expected in both output strings
			(
				u'<b>1890 <a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a></b>',
				u'<b>1890</b>',
				u'<b><a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a></b>',
			),
			# two nested wrapping elements
			(
				u'<p><b>1890 <a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a></b></p>',
				u'<p><b>1890</b></p>',
				u'<p><b><a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a></b></p>',
			),
			# trailing text after the inner element is expected in the remainder only
			(
				u'<p><b>1890 <a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a></b>blergh</p>',
				u'<p><b>1890</b></p>',
				u'<p><b><a href="/wiki/Stop_sign" title="Stop sign">Stop sign</a></b>blergh</p>',
			),
		)

		for html, date_string, remainder in cases:
			self.assertEqual(
				parse_date_html(html),
				(TimelineDate(TimePoint(1890)), date_string, remainder))
示例#3
0
def _table_to_events(table, base_date, p = None):
	"""Given a table html element as a BeautifulSoup, returns a list of
	timeline events built from the rows whose date can be parsed.

	A row with a single cell (a td, or a lone th) is parsed on its own and
	its date is merged into base_date for the rows that follow. Any other
	row takes its date from the 'year' column, optionally combined with a
	'date' column; both are located from the th header texts, falling back
	to the first column when no such header exists.

	table: element exposing find_all, searched for tr/th/td
	base_date: TimelineDate that every row date is combined with
	p: optional parameter dict, passed through param_defaults. When
	   p['keep_row_together'] is true, a row yields a single event whose
	   content joins all content cells; otherwise every item produced by
	   _lines_from_html for a content cell becomes its own event.

	Returns a list of dicts with the keys date, date_length, date_string
	and content.

	Note: the rowspan attributes of the table's cells are decremented in
	place while the rows are walked.
	"""
	p = param_defaults(p or {})

	def get_rowspan(td):
		"""Return the cell's rowspan as a non-negative int, or None if the
		attribute is missing, not an integer, or negative."""
		s = td.get('rowspan')
		if s is None:
			return None

		try:
			i = int(s)
		except ValueError:
			return None

		return i if i >= 0 else None

	def make_event(event_date, date_string, content):
		"""Build one event dict from a date, its display string and content."""
		return {
			'date': event_date.start_year(),
			'date_length': event_date.length(),
			'date_string': date_string,
			'content': content
		}

	events = []

	# locate the year/date columns from the header cells; the index counts
	# th cells within their own row
	year_col_index = None
	date_col_index = None
	for row in table.find_all('tr'):
		for i, cell in enumerate(row.find_all('th')):
			cell_text = cell.get_text().strip().lower()
			if cell_text == 'year':
				year_col_index = i
			elif cell_text == 'date':
				date_col_index = i
	if year_col_index is None:
		if date_col_index is not None:
			# a lone 'date' column plays the role of the year column
			year_col_index = date_col_index
			date_col_index = None
		else:
			# just try using the first column. could be a bit smarter about
			# giving up early to save some cycles...
			year_col_index = 0

	# a td that has a rowspan will be stored as (col_index, cell) The
	# rowspan number essentially gets decremented in the td element each
	# time it is added to the subsequent row
	rowspans = []
	# only used when p['keep_row_together'] is false: maps the inner html of
	# a spanning cell to the (date, date_string) of the row it started on
	open_rowspans = {}

	for row in table.find_all('tr'):
		cells = row.find_all('td')

		# first, apply existing rowspans
		for (i, cell) in rowspans:
			if get_rowspan(cell):
				cells.insert(i, cell)
		# then, recollect existing and new rowspans
		rowspans = []
		for (i, cell) in enumerate(cells):
			rs = get_rowspan(cell)
			if rs:
				cell['rowspan'] = rs - 1
				rowspans.append((i, cell))

		# a row holding nothing but one th is treated like a one-cell row
		if len(cells) == 0:
			header_cells = row.find_all('th')
			if len(header_cells) == 1:
				cells = header_cells

		if len(cells) == 1:
			extract = parse_date_html(_bs_inner_html(cells[0]))
			if extract:
				# the new base_date stays in effect for the following rows
				base_date = TimelineDate.combine(base_date, extract[0])
				events.append(make_event(base_date, extract[1], extract[2]))
			continue

		if len(cells) <= year_col_index:
			continue
		extract = parse_date_html(_bs_inner_html(cells[year_col_index]))
		if not extract:
			continue

		date = extract[0]
		date_string = extract[1]
		if date_col_index is not None and len(cells) > date_col_index:
			extract2 = parse_date_html(_bs_inner_html(cells[date_col_index]))
			if extract2:
				date = TimelineDate.combine(date, extract2[0])
				date_string += ' ' + extract2[1]
		date = TimelineDate.combine(base_date, date)
		content_cells = [cell for (i, cell) in enumerate(cells)
			if i != year_col_index and i != date_col_index]

		if p['keep_row_together']:
			content = ' '.join(_bs_inner_html(cell) for cell in content_cells)
			events.append(make_event(date, date_string, content))
			continue

		# deal with rowspan cells
		for cell in content_cells:
			remaining = get_rowspan(cell)
			if remaining is None:
				continue
			key = _bs_inner_html(cell)
			if key not in open_rowspans:
				if remaining <= 0:
					# the span begins and ends on this row (rowspan of 1 or
					# 0): emit it with the row's own date, otherwise its
					# content would never be output
					for line in _lines_from_html(cell):
						events.append(make_event(date, date_string, line))
				else:
					open_rowspans[key] = (date, date_string)
			elif remaining <= 0:
				# last row of an open span: emit it over the whole range and
				# forget it, so that a later cell with identical html starts
				# a fresh span instead of reusing this start date
				rowspan_start = open_rowspans.pop(key)
				rowspan_date = TimelineDate.span_from_dates(rowspan_start[0], date)
				rowspan_date_string = rowspan_start[1] + ' - ' + date_string
				for line in _lines_from_html(cell):
					events.append(make_event(rowspan_date, rowspan_date_string, line))

		# deal with non-rowspan cells
		for cell in content_cells:
			if get_rowspan(cell) is None:
				for line in _lines_from_html(cell):
					events.append(make_event(date, date_string, line))

	return events
示例#4
0
def string_blocks_to_events(string_blocks, p = None):
	"""Given a set of string blocks (as produced by html_to_string_blocks,
	expects that all strings are non-empty), returns a list of timeline
	events. A timeline event is
	{date: number, date_length: ..., date_string: string, content: string}

	string_blocks: iterable of dicts with a 'heading' list and a 'lines'
		list; each line is a dict with 'line_type' and 'line'
	p: optional parameter dict, passed through param_defaults. The entries
		read here are 'single_section', 'extra_ignore_sections' (section
		names separated by '&') and 'continuations'; p is also forwarded to
		_table_to_events for table lines.
	"""

	curr_ignore_sections = _ignore_sections.copy()
	p = param_defaults(p or {})

	def section_test(name):
		"""Return True if the section with this heading should be processed."""
		name = name.strip().lower()
		if p['single_section']:
			return name == p['single_section'].strip().lower()
		return name not in curr_ignore_sections

	if all(not section_test(sb['heading'][0]) for sb in string_blocks):
		# allow the first section to be processed if it is the only section,
		# excluding excluded sections like see also, etc. Usually this section
		# is just an intro paragraph, but if this if statement is true, it is
		# probably the entire content of the article
		curr_ignore_sections.discard('')
	if p['extra_ignore_sections']:
		for s in p['extra_ignore_sections'].split('&'):
			curr_ignore_sections.add(s.lower().strip())

	curr_event = None
	events = []

	for string_block in string_blocks:
		prev_date = None
		if not section_test(string_block['heading'][0]):
			continue

		# create base date based on headings:
		# possible perf improvement by caching results for headings across string_blocks
		base_date = TimelineDate(TimePoint())
		base_date_string = ''
		for h in string_block['heading']:
			parse = parse_date_html(h)
			if parse:
				base_date = TimelineDate.combine(base_date, parse[0])
				base_date_string = parse[1]

		# if there's a year specified in the headings, we create a fuzzy
		# range that child elements of those headings need to fall in
		base_date_range = None
		if base_date.start_year() is not None:
			delta_minus = 10
			delta_plus = 20
			# widen the range for round years: k trailing zeros allow 10**k
			# years before and twice that after
			m = re.search(r'0+$', str(base_date.start.year))
			if m:
				delta_minus = 10 ** len(m.group())
				delta_plus = delta_minus * 2
			base_date_range = (base_date.start_year() - delta_minus, base_date.start_year() + delta_plus)

		for line in string_block['lines']:
			if line['line_type'] == LineTypes.line:
				parse = parse_date_html(line['line'])
				# if we can parse a date, create a new event; a parsed year
				# is only accepted when there is no heading range, it has no
				# year, the heading is 'antiquity', it falls inside the
				# range, or it can be combined with the heading date as a day
				if parse and (
					not base_date_range
					or parse[0].start_year() is None
					or base_date_string.lower().strip() == 'antiquity'
					or base_date_range[0] <= parse[0].start_year() <= base_date_range[1]
					or TimelineDate.can_combine_as_day(base_date, parse[0])
				):
					_close_event(events, curr_event)
					date = parse[0]
					if date.start_year() is None and prev_date:
						# this is the case where we have a month or
						# monthday but no year. in this case, take it from
						# the previous event
						date = TimelineDate.combine(prev_date, date)
					date = TimelineDate.combine(base_date, date)
					curr_event = {
						'date': date.start_year(),
						'date_length': date.length(),
						'date_string': parse[1],
						'content': parse[2]
					}
					prev_date = date
				# if we can't parse a date, append the line to the
				# current event if there is one
				elif curr_event:
					if p['continuations']:
						curr_event['content'] += _line_break + line['line']
					else:
						# start a separate event that repeats the date of
						# the current one
						_close_event(events, curr_event)
						curr_event = {
							'date': curr_event['date'],
							'date_length': curr_event['date_length'],
							'date_string': curr_event['date_string'],
							'content': line['line']
						}
				# if there's no parse and no current event, see if we can
				# use the base_date
				elif base_date.start_year() is not None:
					# no need to close events because curr_event is None
					curr_event = {
						'date': base_date.start_year(),
						'date_length': base_date.length(),
						'date_string': base_date_string,
						'content': line['line']
					}
			elif line['line_type'] == LineTypes.table:
				_close_event(events, curr_event)
				events += _table_to_events(line['line'], base_date, p)
				curr_event = None
		_close_event(events, curr_event)
		curr_event = None

	return events
示例#5
0
	def helper(self, data):
		"""Assert, for every row in data, that parse_date_html(row[0]) equals row[1]."""
		# ignore warnings whose originating module name matches 'bs4'
		warnings.filterwarnings('ignore', module='bs4')
		for case in data:
			actual = parse_date_html(case[0])
			self.assertEqual(actual, case[1])
示例#6
0
	def test_real_life(self):
		"""Parse a complete <li> item with links whose text starts with a BC year."""
		html = u"""<li>613 BC, July – A <a href="/wiki/Comet" title="Comet">Comet</a>, possibly <a href="/wiki/Comet_Halley" title="Comet Halley" class="mw-redirect">Comet Halley</a>, is recorded in <a href="/wiki/Spring_and_Autumn_Annals" title="Spring and Autumn Annals">Spring and Autumn Annals</a> by the Chinese</li>"""
		parsed = parse_date_html(html)
		# NOTE(review): the expected year is spelled -613 + 1, presumably
		# because 613 BC is stored as year -612 -- confirm against TimePoint
		self.assertEqual(parsed[0], TimelineDate(TimePoint(-613 + 1)))