def get_sentence(element, element_text, text_context, xtext=XTEXT):
    '''Return the sentence of ``text_context`` that holds ``element``'s text.

    ``element_text`` may occur several times in ``text_context``. In that
    case the occurrence is picked by counting the children of ``element``'s
    parent that come before ``element`` and whose normalized text also
    contains ``element_text``: each such child accounts for one earlier
    occurrence.

    :param element: Node being located; must provide ``getparent()``.
    :param element_text: Text of ``element`` to search for.
    :param text_context: Larger text in which the occurrences are searched.
    :param xtext: Callable returning the text of a node.
    :return: ``''`` when ``element_text`` does not occur in
        ``text_context``; otherwise the value returned by ``find_sentence``
        for the selected occurrence (presumably the surrounding sentence --
        confirm against ``find_sentence``).
    '''
    indexes = find_list(text_context, element_text)
    size = len(indexes)
    if size == 0:
        return ''

    text_length = len(element_text)

    def sentence_at(position):
        # The offsets in ``indexes`` refer to ``text_context``, so the lookup
        # must always be performed against ``text_context``.
        start = indexes[position]
        return find_sentence(text_context, start, start + text_length)

    if size == 1:
        return sentence_at(0)

    parent = element.getparent()
    if parent is None:
        # No parent means no siblings to disambiguate with; use the first
        # occurrence instead of failing while iterating over ``None``.
        return sentence_at(0)

    occurrence = 0
    for child in parent:
        if child == element:
            break
        # An earlier sibling carries the same text, so the occurrence that
        # belongs to ``element`` is further along in ``text_context``.
        if element_text in normalize(xtext(child)):
            occurrence += 1

    if occurrence < size:
        return sentence_at(occurrence)

    # More matching siblings than occurrences: something went wrong, so fall
    # back to the first occurrence within ``text_context``.
    return sentence_at(0)
def get_sentence(element, element_text, text_context, xtext=XTEXT):
    """Return the sentence of ``text_context`` that holds ``element``'s text.

    When ``element_text`` occurs more than once in ``text_context``, the
    occurrence is selected by counting the children of ``element``'s parent
    that precede ``element`` and whose normalized text also contains
    ``element_text``; every such child shifts the selection by one.

    :param element: Node being located; must provide ``getparent()``.
    :param element_text: Text of ``element`` to search for.
    :param text_context: Larger text in which the occurrences are searched.
    :param xtext: Callable returning the text of a node.
    :return: ``""`` when ``element_text`` does not occur in
        ``text_context``; otherwise the value returned by ``find_sentence``
        for the selected occurrence (presumably the surrounding sentence --
        confirm against ``find_sentence``).
    """
    indexes = find_list(text_context, element_text)
    size = len(indexes)
    if size == 0:
        return ""

    span = len(element_text)

    if size == 1:
        return find_sentence(text_context, indexes[0], indexes[0] + span)

    preceding_matches = 0
    parent = element.getparent()
    # Without a parent there are no siblings to inspect; the counter stays at
    # zero and the first occurrence is used instead of iterating over None.
    if parent is not None:
        for child in parent:
            if child == element:
                break
            # A previous sibling has the same text, so the first index is
            # not the one that belongs to ``element``.
            if element_text in normalize(xtext(child)):
                preceding_matches += 1

    if preceding_matches >= size:
        # Something went wrong (more matching siblings than occurrences):
        # fall back to the first occurrence. The offsets index into
        # ``text_context``, so that is the text that must be searched.
        preceding_matches = 0

    start = indexes[preceding_matches]
    return find_sentence(text_context, start, start + span)
def get_text(self, element, complex_text=False):
    '''Return the normalized text of ``element`` once filtered nodes are removed.

    The work is done on a deep copy, so ``element`` itself is not modified.
    Every node returned by ``self.first_filter`` for the copy that the copy
    reports as contained (``in``) is removed from it. The text parts are then
    collected with ``get_recursive_text`` when ``complex_text`` is true, or
    with the ``.//text()`` XPath query otherwise, joined with newlines and
    passed through ``normalize``.

    Copying the element on every call is costly, but it gives a good text
    representation for a hierarchical section.
    '''
    pruned = deepcopy(element)
    for unwanted in self.first_filter(pruned):
        if unwanted in pruned:
            pruned.remove(unwanted)

    if complex_text:
        fragments = []
        get_recursive_text(pruned, fragments)
    else:
        fragments = pruned.xpath('.//text()')

    return normalize('\n'.join(fragments))
def get_text(self, element, complex_text=False):
    """Compute the normalized text of ``element`` without its filtered nodes.

    A deep copy of ``element`` is stripped of the nodes that
    ``self.first_filter`` returns for it (only those the copy reports as
    contained via ``in``). The remaining text parts are gathered either by
    ``get_recursive_text`` (``complex_text`` true) or by the ``.//text()``
    XPath query, joined with newlines and normalized.

    The deep copy makes this expensive, but it keeps ``element`` intact and
    gives a good text representation for a hierarchical section.
    """
    working_copy = deepcopy(element)
    rejected = self.first_filter(working_copy)
    for node in rejected:
        if node in working_copy:
            working_copy.remove(node)

    if not complex_text:
        pieces = working_copy.xpath(".//text()")
    else:
        pieces = []
        get_recursive_text(working_copy, pieces)

    joined = "\n".join(pieces)
    return normalize(joined)
def get_text_from_parent(self, parent, index=0, complex_text=False):
    '''Return the normalized text of the element selected under ``parent``.

    The element is looked up with ``self.get_element(parent, index)``. When
    that lookup yields ``None``, the empty string is normalized and returned;
    otherwise the result of ``self.get_text`` for the element is normalized
    and returned.
    '''
    elem = self.get_element(parent, index)
    text = '' if elem is None else self.get_text(elem, complex_text)
    return normalize(text)
def get_text(self, element):
    """Return the text that ``self.xtext`` yields for ``element``, normalized."""
    raw_text = self.xtext(element)
    return normalize(raw_text)
def get_text_from_parent(self, parent, index=0):
    '''Return the normalized text of the element selected under ``parent``.

    The element comes from ``self.get_element(parent, index)``. If none is
    returned, the empty string is normalized instead of the text produced by
    ``self.xtext``.
    '''
    elem = self.get_element(parent, index)
    if elem is None:
        return normalize('')
    return normalize(self.xtext(elem))
def get_text(element, xtext=XTEXT):
    """Return the normalized text of ``element``.

    :param element: Node handed to ``xtext``.
    :param xtext: Callable that extracts the text of ``element``.
    :return: The extracted text after ``normalize`` has been applied.
    """
    extracted = xtext(element)
    return normalize(extracted)
def get_text_from_parent(self, parent, index=0, complex_text=False):
    """Look up an element under ``parent`` and return its normalized text.

    ``self.get_element(parent, index)`` selects the element. A ``None``
    result leads to the normalized empty string; any other result is passed
    to ``self.get_text`` together with ``complex_text`` and the outcome is
    normalized.
    """
    target = self.get_element(parent, index)
    if target is None:
        return normalize("")
    return normalize(self.get_text(target, complex_text))
def get_text_from_parent(self, parent, index=0):
    """Look up an element under ``parent`` and return its normalized text.

    ``self.get_element(parent, index)`` selects the element. When it returns
    ``None`` the empty string is normalized; otherwise the text produced by
    ``self.xtext`` for the element is normalized.
    """
    found = self.get_element(parent, index)
    text = "" if found is None else self.xtext(found)
    return normalize(text)