Пример #1
0
	def test_span2(self):
		"""get_span on tag-free text is expected to return the plain substring."""
		splitter = HtmlSplitter('abcdefghijkl')
		expectations = (
			((2, 5), u'cde'),
			((5, 12), u'fghijkl'),
			# an end index one past the 12-character text must give the same
			# result as ending exactly at 12
			((5, 13), u'fghijkl'),
		)
		for (start, end), expected in expectations:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)
Пример #2
0
def _separate_events(events):
	"""Split every event into one event per sentence span of its content.

	For each event, the 'content' value is wrapped in an HtmlSplitter and
	the spans yielded by _sentence_splitter.span_tokenize over the
	splitter's text_string are cut out with get_span. Every span produces a
	new dict that copies the event's 'date' and 'date_string' and stores
	the span, converted with unicode(), as 'content'.

	Returns the list of new event dicts, in input order.
	"""
	new_events = []
	for event in events:
		splitter = HtmlSplitter(event['content'])
		for start, end in _sentence_splitter.span_tokenize(splitter.text_string):
			piece = splitter.get_span(start, end)
			# Open question from the original author: keep the interface
			# consistent, or avoid having to reparse? The span is stored as
			# a unicode string here.
			new_events.append({
				'date': event['date'],
				'date_string': event['date_string'],
				'content': unicode(piece),
			})
	return new_events
Пример #3
0
	def test_span1(self):
		"""Check get_span output for several spans of the self.data fixture.

		The fixture itself is defined outside this method.
		"""
		splitter = HtmlSplitter(self.data)

		cases = [
			(0, 5, u'0abc4'),
			(5, 8, u'<p><b><a>14d</a></b></p>'),
			(0, 9, u'0abc4<p><b><a>14de</a></b></p>'),
			(2, 24, u'bc4<p><b><a>14defg21</a></b>30hijk37</p>42m'),
		]
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)
Пример #4
0
	def test_span4(self):
		"""Check top-level ranges and spans for markup containing <br/> tags."""
		splitter = HtmlSplitter('<p class="blah">abc<br/>def</p>ghi<br/>jkl<br/>')

		actual_ranges = [entry['range'] for entry in splitter._top_level_ranges]
		expected_ranges = [(0, 6), (6, 9), (9, 9), (9, 12), (12, 12)]
		self.assertEqual(actual_ranges, expected_ranges)

		cases = (
			(2, 5, u'<p class="blah">c<br/>de</p>'),
			(3, 4, u'<p class="blah">d</p>'),
			(5, 12, u'<p class="blah">f</p>ghi<br/>jkl'),
		)
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)
Пример #5
0
def parse_date_html(html_string):
	"""Extract a leading date and the remaining content from an html string.

	Takes a string that contains html, and returns (date, date_string,
	content) as a tuple. For now, date is an int that represents the year.
	Negative numbers are B.C. and positive are A.D. years.

	date_string is the span of the html covering the date text, and content
	is the span following the date (after any separator characters), or ''
	when nothing follows. Both spans come from HtmlSplitter.get_span.

	Returns None if the html has no text or if no date can be parsed.
	"""

	# preprocess to add newlines around <br />, or else get_text smushes
	# things together. find_all collects the tags up front, so the tree is
	# not modified while it is still being traversed.
	soup = BeautifulSoup(html_string)
	for br in soup.find_all('br'):
		br.insert_after(soup.new_string('\n'))
		br.insert_before(soup.new_string('\n'))

	html_splitter = HtmlSplitter(unicode(soup))
	s = html_splitter.text_string
	if not s:
		return None

	content_offset = 0

	# skip all non-letter/digit characters at the beginning
	m = re.search(r'^[^\d\w]+', s)
	if m:
		content_offset += m.end()

	# get the date; date_index is relative to the start of the slice
	extract = parse_date_text(s[content_offset:])
	if not extract:
		return None
	(date, date_index) = extract
	date_string = html_splitter.get_span(content_offset, date_index + content_offset)

	content_offset += date_index

	# skip any transition characters between the date and the content
	# (backslashes are doubled so the literal has no invalid string escapes;
	# the resulting pattern text is unchanged)
	m = re.search(u'^[\\s\\-–—:\\.]+', s[content_offset:])
	if m:
		content_offset += m.end()

	if content_offset >= len(s):
		content = ''
	else:
		content = html_splitter.get_span(content_offset, len(s))

	return (date, date_string, content)
Пример #6
0
	def test_ranges(self):
		"""Check _top_level_ranges and _get_applicable_ranges on self.data.

		The fixture itself is defined outside this method.
		"""
		splitter = HtmlSplitter(self.data)
		top_level_ranges = splitter._top_level_ranges

		def ranges_of(entries):
			# reduce each range record to its 'range' tuple for comparison
			return [entry['range'] for entry in entries]

		self.assertEqual(ranges_of(top_level_ranges), [(0, 5), (5, 21), (21, 29)])

		lookups = (
			(0, 5, [(0, 5)]),
			(22, 29, [(21, 29)]),
			(3, 7, [(0, 5), (5, 21)]),
		)
		for start, end, expected in lookups:
			applicable = splitter._get_applicable_ranges(top_level_ranges, start, end)
			self.assertEqual(ranges_of(applicable), expected)
Пример #7
0
	def test_empty(self):
		"""Check get_span on empty input and on markup with empty elements."""
		scenarios = (
			('', (
				(0, 0, u''),
				(0, 5, u''),
				(-5, 5, u''),
			)),
			('<p></p>', (
				(0, 0, u''),
				(0, 5, u'<p></p>'),
				(-5, 5, u'<p></p>'),
			)),
			('<p>hello</p><p></p><p>there</p>', (
				(0, 0, u''),
				(0, 5, u'<p>hello</p>'),
				(-5, 7, u'<p>hello</p><p></p><p>th</p>'),
			)),
		)
		for html, cases in scenarios:
			splitter = HtmlSplitter(html)
			for start, end, expected in cases:
				# the span is compared directly, without unicode() conversion
				self.assertEqual(splitter.get_span(start, end), expected)
Пример #8
0
	def test_out_of_range(self):
		"""Check get_span with bounds that fall outside the text."""
		splitter = HtmlSplitter('<p>blah</p>')

		cases = (
			(0, 200, u'<p>blah</p>'),
			(-5, 3, u'<p>bla</p>'),
		)
		for start, end, expected in cases:
			self.assertEqual(splitter.get_span(start, end), expected)
Пример #9
0
	def test_span3(self):
		"""Spans inside a single <p> are expected to keep the tag and its attribute."""
		splitter = HtmlSplitter('<p class="blah">abcdefghijkl</p>')
		cases = (
			(2, 5, u'<p class="blah">cde</p>'),
			(5, 12, u'<p class="blah">fghijkl</p>'),
		)
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)