def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
    """Return the excerpt window of ``self.text_block`` with matched terms wrapped in tags.

    Args:
        highlight_locations: Mapping of term -> iterable of character
            offsets into ``self.text_block``. A term is only highlighted
            when the lowercased text at that offset equals the term, so
            terms are expected to be lowercase. ``None`` means no terms.
        start_offset: Index in ``self.text_block`` where the window starts.
            ``None`` means the beginning of the text.
        end_offset: Index (exclusive) where the window ends. ``None`` means
            the end of the text.

    Returns:
        The windowed text with each match wrapped in ``self.html_tag``
        (carrying ``class="<self.css_class>"`` when a CSS class is set),
        prefixed with ``...`` when the window starts after the beginning of
        the text and suffixed with ``...`` when it ends before the end.

    NOTE(review): the text is inserted into the markup as-is, without HTML
    escaping; confirm callers escape untrusted content.
    """
    # Normalise the optional arguments so the arithmetic and comparisons
    # below never see None.
    if highlight_locations is None:
        highlight_locations = {}
    if start_offset is None:
        start_offset = 0
    if end_offset is None:
        end_offset = len(self.text_block)

    # Start by chopping the block down to the proper window.
    text = self.text_block[start_offset:end_offset]

    # Invert highlight_locations to a sorted list of
    # (window-relative location, term) pairs.
    loc_to_term = sorted(
        (loc - start_offset, term)
        for term, locations in highlight_locations.items()
        for loc in locations
    )

    # Prepare the highlight template.
    if self.css_class:
        hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
    else:
        hl_start = '<%s>' % (self.html_tag)
    hl_end = '</%s>' % self.html_tag

    # Walk the matches in order, copying the untouched text between them and
    # replacing each match with its highlighted version.
    pieces = []
    copied_upto = 0  # index in ``text`` up to which output has been emitted

    for cur, cur_str in loc_to_term:
        # Locations before the window become negative here; as slice
        # indices they would wrap around and could match text near the end.
        if cur < 0:
            continue

        # This can be in a different case than cur_str.
        actual_term = text[cur:cur + len(cur_str)]

        # Handle incorrect highlight_locations by first checking for the term.
        if actual_term.lower() != cur_str:
            continue

        # Skip a hit that starts inside the previously highlighted span;
        # emitting it would duplicate text that is already in the output.
        if cur < copied_upto:
            continue

        pieces.append(text[copied_upto:cur])
        pieces.append(hl_start + actual_term + hl_end)
        copied_upto = cur + len(actual_term)

    # Don't forget the chunk after the last term.
    pieces.append(text[copied_upto:])
    highlighted_chunk = "".join(pieces)

    # Unicode characters at the end of highlighted_chunk might get split,
    # leaving behind partial encodings at the end that break Javascript.
    # So our highlighted_chunk could be "My heart is in the \u4".
    #
    # Remove the orphaned encodings by calling trim().
    from haystack import trim
    highlighted_chunk = trim.trim(highlighted_chunk)

    if start_offset > 0:
        highlighted_chunk = '...%s' % highlighted_chunk

    if end_offset < len(self.text_block):
        highlighted_chunk = '%s...' % highlighted_chunk

    return highlighted_chunk
def test_trim(self):
    """Check that the output of trim() satisfies is_trimmed() for selected inputs."""
    # Every backslash is escaped so that each case holds the literal
    # backslash-u text on both Python 2 and Python 3. Written unescaped, the
    # truncated forms (backslash-u followed by fewer than four hex digits)
    # are a syntax error on Python 3, and the complete forms would collapse
    # into single characters instead of the escape text under test.
    cases = (
        "34\\u2345\\u3456",          # ends on a complete escape
        "\\u1234\\u2345\\u3456",     # only complete escapes
        "\\u2345\\u3456\\",          # trailing lone backslash
        "34\\u2345\\u3456 \\u",      # escape prefix with no hex digits
        "34\\u2345\\u3456\\u3",      # one hex digit
        "34\\u2345\\u3456\\u34",     # two hex digits
        "34\\u2345\\u3456\\u345",    # three hex digits
        "",                          # empty input
    )

    for case in cases:
        self.assertTrue(
            trim.is_trimmed(trim.trim(case)),
            "trim() output is not trimmed for input %r" % (case,),
        )