def test_span2(self):
    """get_span on markup-free text returns the matching slice of the text.

    The last case asks for an end index one past the 12-character input and
    expects the same result as ending exactly at the text's end.
    """
    plain = HtmlSplitter('abcdefghijkl')
    expectations = (
        ((2, 5), u'cde'),
        ((5, 12), u'fghijkl'),
        ((5, 13), u'fghijkl'),
    )
    for (begin, finish), expected in expectations:
        self.assertEqual(unicode(plain.get_span(begin, finish)), expected)
def _separate_events(events):
    """Break each event's html content into one event per tokenized span.

    For every event, the content is loaded into an HtmlSplitter, the
    splitter's text is tokenized with _sentence_splitter.span_tokenize, and
    each resulting (start, end) pair becomes a new event whose 'content' is
    the unicode form of that html span. 'date' and 'date_string' are copied
    from the source event unchanged.

    Returns a new list; the input events are not modified.
    """
    split_events = []
    for event in events:
        splitter = HtmlSplitter(event['content'])
        spans = _sentence_splitter.span_tokenize(splitter.text_string)
        for begin, finish in spans:
            fragment = splitter.get_span(begin, finish)
            # not sure whether to go for interface consistency (keep the
            # span object) or not having to reparse (store plain unicode);
            # plain unicode is stored for now
            split_events.append({
                'date': event['date'],
                'date_string': event['date_string'],
                'content': unicode(fragment),
            })
    return split_events
def test_span1(self):
    """get_span on the shared fixture keeps the enclosing tags of each span.

    self.data is defined outside this method; the expected strings below
    pin the html that each text-offset range should produce.
    """
    fixture = HtmlSplitter(self.data)
    expectations = (
        ((0, 5), u'0abc4'),
        ((5, 8), u'<p><b><a>14d</a></b></p>'),
        ((0, 9), u'0abc4<p><b><a>14de</a></b></p>'),
        ((2, 24), u'bc4<p><b><a>14defg21</a></b>30hijk37</p>42m'),
    )
    for (begin, finish), expected in expectations:
        self.assertEqual(unicode(fixture.get_span(begin, finish)), expected)
def test_span4(self):
    """Spans across <br/> elements, inside and outside a paragraph.

    First pins the top-level ranges computed for the markup (the <br/>
    entries are expected to be zero-width), then checks the html produced
    for several text-offset ranges.
    """
    mixed = HtmlSplitter('<p class="blah">abc<br/>def</p>ghi<br/>jkl<br/>')

    expected_ranges = [(0, 6), (6, 9), (9, 9), (9, 12), (12, 12)]
    actual_ranges = [entry['range'] for entry in mixed._top_level_ranges]
    self.assertEqual(actual_ranges, expected_ranges)

    expectations = (
        ((2, 5), u'<p class="blah">c<br/>de</p>'),
        ((3, 4), u'<p class="blah">d</p>'),
        ((5, 12), u'<p class="blah">f</p>ghi<br/>jkl'),
    )
    for (begin, finish), expected in expectations:
        self.assertEqual(unicode(mixed.get_span(begin, finish)), expected)
def parse_date_html(html_string):
    """Split an html string into its leading date and the content after it.

    Returns a (date, date_string, content) tuple. For now, date is an int
    that represents the year: negative numbers are B.C. and positive are
    A.D. years. date_string is the html span covering the date text, and
    content is the html span following it ('' when nothing follows).

    Returns None if the text is empty or no date can be parsed from it.
    """
    # Surround every <br /> with explicit newlines first; otherwise
    # get_text smushes the neighbouring text together.
    soup = BeautifulSoup(html_string)
    for node in soup.descendants:
        if node.name == 'br':
            node.insert_after(soup.new_string('\n'))
            node.insert_before(soup.new_string('\n'))

    splitter = HtmlSplitter(unicode(soup))
    text = splitter.text_string

    # Skip everything at the start that is not a letter or digit.
    date_start = 0
    leading_junk = re.search('^[^\d\w]+', text)
    if leading_junk is not None:
        date_start = leading_junk.end()

    if not text:
        return None

    # parse_date_text works on the trimmed text, so the index it reports
    # is relative to date_start.
    parsed = parse_date_text(text[date_start:])
    if not parsed:
        return None
    date, date_length = parsed

    date_end = date_length + date_start
    date_string = splitter.get_span(date_start, date_end)

    # Skip any transition characters (whitespace, dashes, colon, period)
    # sitting between the date and the content.
    content_start = date_end
    separator = re.search(u'^[\s\-–—:\.]+', text[content_start:])
    if separator is not None:
        content_start += separator.end()

    text_length = len(text)
    if content_start >= text_length:
        content = ''
    else:
        content = splitter.get_span(content_start, text_length)

    return (date, date_string, content)
def test_ranges(self):
    """Top-level ranges of the fixture and the lookup of applicable ranges.

    self.data is defined outside this method. The first check pins the
    fixture's top-level ranges; the remaining ones pin which of those
    ranges _get_applicable_ranges returns for a given (start, end) pair.
    """
    fixture = HtmlSplitter(self.data)
    top_level = fixture._top_level_ranges

    def bounds(entries):
        # Reduce range records to their bare (start, end) tuples.
        return [entry['range'] for entry in entries]

    self.assertEqual(bounds(top_level), [(0, 5), (5, 21), (21, 29)])

    expectations = (
        ((0, 5), [(0, 5)]),
        ((22, 29), [(21, 29)]),
        ((3, 7), [(0, 5), (5, 21)]),
    )
    for (begin, finish), expected in expectations:
        applicable = fixture._get_applicable_ranges(top_level, begin, finish)
        self.assertEqual(bounds(applicable), expected)
def test_empty(self):
    """get_span with empty input, an empty element, and an empty sibling.

    Covers a zero-width span, a span longer than the text, and a span with
    a negative start for each piece of markup. Results are compared
    directly, without converting them through unicode().
    """
    scenarios = (
        ('', (
            ((0, 0), u''),
            ((0, 5), u''),
            ((-5, 5), u''),
        )),
        ('<p></p>', (
            ((0, 0), u''),
            ((0, 5), u'<p></p>'),
            ((-5, 5), u'<p></p>'),
        )),
        ('<p>hello</p><p></p><p>there</p>', (
            ((0, 0), u''),
            ((0, 5), u'<p>hello</p>'),
            ((-5, 7), u'<p>hello</p><p></p><p>th</p>'),
        )),
    )
    for markup, expectations in scenarios:
        candidate = HtmlSplitter(markup)
        for (begin, finish), expected in expectations:
            self.assertEqual(candidate.get_span(begin, finish), expected)
def test_out_of_range(self):
    """Span bounds outside the text are expected to behave as if clamped.

    An end far past the text yields the whole paragraph; a negative start
    yields the span from the beginning of the text.
    """
    paragraph = HtmlSplitter('<p>blah</p>')
    expectations = (
        ((0, 200), u'<p>blah</p>'),
        ((-5, 3), u'<p>bla</p>'),
    )
    for (begin, finish), expected in expectations:
        self.assertEqual(paragraph.get_span(begin, finish), expected)
def test_span3(self):
    """Spans inside a single paragraph keep the paragraph tag and its class."""
    paragraph = HtmlSplitter('<p class="blah">abcdefghijkl</p>')
    expectations = (
        ((2, 5), u'<p class="blah">cde</p>'),
        ((5, 12), u'<p class="blah">fghijkl</p>'),
    )
    for (begin, finish), expected in expectations:
        self.assertEqual(unicode(paragraph.get_span(begin, finish)), expected)