def replace_at(self, original, replacement, locations):
    """Replace the occurrences of original at all the locations with
    replacement.

    :param original: the text to look for in ``self.text``.
    :param replacement: the text substituted for each selected occurrence.
    :param locations: iterable of values identifying which occurrences to
        replace; it is handed to ``LocationReplace.location_replace_text``
        in ascending order.

    ``self.text`` is overwritten with the result and then passed through
    ``self.unescape_text()``. Nothing is returned.
    """
    # Sort a copy: list.sort() would silently reorder the caller's list,
    # which may be shared with the layer element that supplied it.
    ordered_locations = sorted(locations)
    self.text = LocationReplace().location_replace_text(
        self.text, original, replacement, ordered_locations)
    self.unescape_text()
def test_location_replace_text(self):
    """location_replace_text swaps only the occurrences named by locations.

    Every case asks for locations [0, 2] and expects the first and third
    occurrence to be replaced while the second one is left untouched.
    """
    # (text, original, replacement, expected result)
    cases = (
        ('Bunch of as as as', 'as', '<sub>as</sub>',
         'Bunch of <sub>as</sub> as <sub>as</sub>'),
        ('Bunch of as as as', 'as', '<sub>b</sub>',
         'Bunch of <sub>b</sub> as <sub>b</sub>'),
        ('Bunch of a_{s} a_{s} a_{s}', 'a_{s}', 'a<sub>s</sub>',
         'Bunch of a<sub>s</sub> a_{s} a<sub>s</sub>'),
    )
    for text, original, replacement, expected in cases:
        # A fresh replacer per case, so no state leaks between cases.
        replaced = LocationReplace().location_replace_text(
            text, original, replacement, [0, 2])
        self.assertEqual(expected, replaced)
def inline_replacements(self, text_index, original_text):
    """Apply multiple inline layers to given text (e.g. links,
    highlighting, etc.)

    Generator: for every (original, replacement, offset) triple produced
    by ``self.apply_layer(original_text, text_index)`` it yields a
    ``Replacement`` whose locations list holds a single entry -- the
    position of ``offset`` within the result of
    ``LocationReplace.find_all_offsets(original, original_text)``.
    """
    # Convert each offset-based triple into a search-and-replace element.
    for source, substitute, offset in self.apply_layer(original_text,
                                                       text_index):
        occurrences = LocationReplace.find_all_offsets(source, original_text)
        occurrence_number = occurrences.index(offset)
        yield Replacement(source, substitute, [occurrence_number])
def get_layer_pairs(self, text_index, original_text):
    """Gather every layer's replacements as search-and-replace elements.

    Each layer in ``self.layers`` is asked for its
    (original, replacement, offset) triples via
    ``apply_layer(original_text, text_index)``. The returned list holds
    (original, replacement, locations) tuples, where ``locations`` is a
    one-item list: the position of ``offset`` within the result of
    ``LocationReplace.find_all_offsets(original, original_text)``.
    """
    # Phase 1: collect the offset-based triples from all layers.
    triples = []
    for layer in self.layers.values():
        triples.extend(layer.apply_layer(original_text, text_index))

    # Phase 2: convert from offset-based to a search and replace layer.
    return [
        (original, replacement,
         [LocationReplace.find_all_offsets(
             original, original_text).index(offset)])
        for original, replacement, offset in triples
    ]
def get_layer_pairs(self, text_index, original_text):
    """Gather every layer's replacements as search-and-replace elements.

    Layers whose ``apply_layer`` result is falsy contribute nothing. Each
    remaining (original, replacement, offset) triple becomes an
    (original, replacement, locations) tuple, where ``locations`` is a
    one-item list holding the position of ``offset`` within the result of
    ``LocationReplace.find_all_offsets(original, original_text)``.

    Triples whose offset cannot be resolved are logged at INFO level and
    left out of the returned list rather than raising.
    """
    triples = []
    for layer in self.layers.values():
        produced = layer.apply_layer(original_text, text_index)
        if not produced:
            continue
        triples += produced

    # Convert from offset-based to a search and replace layer.
    elements = []
    for original, replacement, offset in triples:
        candidates = LocationReplace.find_all_offsets(original, original_text)
        try:
            position = candidates.index(offset)
        except Exception as ex:
            # Best effort: report the unresolved offset and carry on.
            logging.info('{0!s}'.format(ex))
            logging.info('Problem interpolating offsets: {0}, {1}'.format(candidates, offset))
        else:
            elements.append((original, replacement, [position]))
    return elements
def get_layer_pairs(self, text_index, original_text):
    """Build (original, replacement, locations) tuples from all layers.

    Every layer in ``self.layers`` is applied to ``original_text`` at
    ``text_index``; falsy results are skipped. For each resulting
    (original, replacement, offset) triple, ``locations`` is a one-item
    list with the position of ``offset`` inside the result of
    ``LocationReplace.find_all_offsets(original, original_text)``.

    A triple whose offset lookup fails is logged at INFO level and
    dropped; no exception escapes from the lookup.
    """
    gathered = []
    for current_layer in self.layers.values():
        outcome = current_layer.apply_layer(original_text, text_index)
        if outcome:
            gathered += outcome

    # Convert from offset-based to a search and replace layer.
    converted = []
    for source, substitute, offset in gathered:
        known_offsets = LocationReplace.find_all_offsets(
            source, original_text)
        try:
            occurrence = known_offsets.index(offset)
        except Exception as ex:
            # Best effort: log the mismatch and move on to the next triple.
            logging.info('{0!s}'.format(ex))
            logging.info('Problem interpolating offsets: {0}, {1}'.format(
                known_offsets, offset))
            continue
        converted.append((source, substitute, [occurrence]))
    return converted
def location_replace(self, xml_node, original, replacement, locations):
    """Hand a location-based replacement over to a new LocationReplace.

    All four arguments are forwarded unchanged to
    ``LocationReplace.location_replace``; whatever that call returns is
    discarded, so this method always returns None.
    """
    replacer = LocationReplace()
    replacer.location_replace(xml_node, original, replacement, locations)
class LayersApplier(object):
    """Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other.

    Layer elements are (original, replacement, locations) tuples. They are
    queued by the length of ``original`` so that longer originals are
    applied before shorter ones.
    """

    def __init__(self):
        # Holds (-len(original), layer_element) entries.
        self.queue = PriorityQueue()
        # The text being rewritten; set by apply_layers().
        self.text = None

    def enqueue_from_list(self, elements_list):
        """Enqueue every layer element of elements_list, in order."""
        for layer_element in elements_list:
            self.enqueue(layer_element)

    def enqueue(self, layer_element):
        """Queue one (original, replacement, locations) layer element.

        The priority is the negated length of ``original``: the queue hands
        out its smallest entry first, so the longest original comes out
        first.
        """
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def replace(self, xml_node, original, replacement):
        """Helper method for replace_all(), this actually does the replace.
        This deals with XML nodes, not nodes in the tree.

        Replaces original with replacement in the text and tail of xml_node
        and, recursively, of all its descendants. Returns xml_node.
        """
        if xml_node.text:
            xml_node.text = xml_node.text.replace(original, replacement)

        # Iterate the element directly: getchildren() is deprecated in lxml
        # and no longer exists on xml.etree elements.
        for child in xml_node:
            self.replace(child, original, replacement)

        if xml_node.tail:
            xml_node.tail = xml_node.tail.replace(original, replacement)

        return xml_node

    def location_replace(self, xml_node, original, replacement, locations):
        """Forward a location-based replacement on xml_node to a new
        LocationReplace instance. Returns None."""
        LocationReplace().location_replace(
            xml_node, original, replacement, locations)

    def unescape_text(self):
        """Because of the way we do replace_all(), we need to unescape
        HTML entities in self.text."""
        # HTMLParser.unescape was removed in Python 3.9; html.unescape is
        # its replacement. The import is local because this class uses the
        # module-level name `html` for the parsing helpers in replace_all().
        try:
            from html import unescape
        except ImportError:
            # Python 2: there is no stdlib html.unescape.
            unescape = HTMLParser().unescape
        self.text = unescape(self.text)

    def replace_all(self, original, replacement):
        """Replace all occurrences of original with replacement. This is
        HTML aware.

        self.text is parsed as an HTML fragment wrapped in a <div>, the
        replacement is applied to text and tail content only, and the
        wrapper is stripped again before entities are unescaped.
        """
        htmlized = html.fragment_fromstring(self.text, create_parent='div')
        htmlized = self.replace(htmlized, original, replacement)

        serialized = html.tostring(htmlized)
        # On Python 3 lxml's tostring returns bytes by default; decode so
        # the str operations below work. On Python 2 bytes is str, so this
        # branch is never taken.
        if isinstance(serialized, bytes) and not isinstance(serialized, str):
            serialized = serialized.decode('utf-8')

        # Strip the wrapper added by create_parent='div'.
        serialized = serialized.replace("<div>", "", 1)
        self.text = serialized[:serialized.rfind("</div>")]
        self.unescape_text()

    def replace_at(self, original, replacement, locations):
        """Replace the occurrences of original at all the locations with
        replacement.

        ``locations`` is passed on in ascending order; the caller's own
        list is left untouched.
        """
        # sorted() instead of list.sort(): the list belongs to the queued
        # layer element and must not be reordered behind the caller's back.
        ordered_locations = sorted(locations)
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, ordered_locations)
        self.unescape_text()

    def apply_layers(self, original_text):
        """Apply every queued layer element to original_text.

        Elements with an empty ``locations`` replace every occurrence
        (replace_all); the others replace only the listed occurrences
        (replace_at). The queue is drained. Returns the resulting text.
        """
        self.text = original_text

        while not self.queue.empty():
            _, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text
def unescape_text(self):
    """Because of the way we do replace_all(), we need to unescape
    HTML entities.

    Rewrites ``self.text`` with its HTML entities converted back to the
    characters they stand for. Returns None.
    """
    # HTMLParser.unescape was removed in Python 3.9; html.unescape is its
    # replacement. The import is local so it cannot clash with any
    # module-level name `html` in the surrounding file.
    try:
        from html import unescape
    except ImportError:
        # Python 2: there is no stdlib html.unescape.
        unescape = HTMLParser().unescape
    self.text = unescape(self.text)
def test_update_offsets_html(self):
    """update_offsets records the offsets of "a" for each text in turn.

    The same LocationReplace instance is reused for both texts. For the
    second text only the match at (8, 9) is expected, i.e. none of the
    "a" characters inside the anchor markup are reported.
    """
    replacer = LocationReplace()
    # (text to scan, offsets expected afterwards)
    cases = (
        ("This is a test. It is only a test",
         {0: (8, 9), 1: (27, 28)}),
        ("This is a test. <a href='something'>link</a>",
         {0: (8, 9)}),
    )
    for text, expected_offsets in cases:
        replacer.update_offsets("a", text)
        self.assertEqual(replacer.offsets, expected_offsets)