def escape_html_characters(content):
    """
    Strip markup remnants that should not reach the ElasticSearch indexer.

    This is complementary to the html_to_text method found in
    xmodule/annotator_mixin.py: the content is first passed through
    html_to_text, then whitespace runs are collapsed and any CDATA sections
    and HTML comments still present in the text are removed.

    Args:
        content (str): markup to clean up

    Returns:
        str: content ready to be indexed by ElasticSearch
    """
    text = html_to_text(content)

    # Collapse every run of whitespace and "//" into a single space. After
    # this step the text contains no newlines, so "." in the patterns below
    # can span what used to be several lines.
    # NOTE(review): the middle alternative is a literal space-like character;
    # confirm it was not meant to be the "&nbsp;" entity.
    text = re.sub(r"(\s| |//)+", " ", text)

    # Remove CDATA sections. The match is non-greedy so that text sitting
    # between two separate sections is kept instead of being swallowed.
    text = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text)

    # Remove HTML comments, one comment at a time for the same reason.
    return re.sub(r"<!--.*?-->", "", text)
def strip_html_content_to_text(html_content):
    """
    Gets only the textual part for html content - useful for building text
    to be searched.

    The content is passed through html_to_text, whitespace runs are collapsed
    to single spaces, and any CDATA sections and HTML comments still present
    in the result are removed.

    Args:
        html_content (str): markup to reduce to searchable text

    Returns:
        str: the cleaned-up text
    """
    # Collapse every run of whitespace and "//" into a single space; this
    # also removes all newlines, so "." below can match across former lines.
    # NOTE(review): the middle alternative is a literal space-like character;
    # confirm it was not meant to be the "&nbsp;" entity.
    text_content = re.sub(r"(\s| |//)+", " ", html_to_text(html_content))

    # Remove CDATA sections. Non-greedy, so text located between two
    # separate sections survives.
    text_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text_content)

    # Remove HTML comments, each one individually (non-greedy).
    text_content = re.sub(r"<!--.*?-->", "", text_content)

    return text_content
def __init__(self, *args, **kwargs):
    """
    Set up the module from its XML definition.

    Parses ``self.data``, stores the extracted instructions, stores the
    serialized ``json`` child element after passing it through html_to_text,
    and records the current user's email when the runtime exposes
    ``get_real_user``.
    """
    super(ImageAnnotationModule, self).__init__(*args, **kwargs)

    root = etree.fromstring(self.data)
    self.instructions = self._extract_instructions(root)

    json_markup = etree.tostring(root.find('json'), encoding='unicode')
    self.openseadragonjson = html_to_text(json_markup)

    # Stays empty when the runtime offers no way to resolve the real user.
    self.user = ""
    if self.runtime.get_real_user is not None:
        real_user = self.runtime.get_real_user(self.runtime.anonymous_student_id)
        self.user = real_user.email
def index_dictionary(self):
    """
    Build the search-index entry for this block.

    Starts from the parent class's index dictionary, adds the block's text
    (``self.data`` passed through html_to_text and cleaned of whitespace
    runs, CDATA sections and HTML comments) together with the display name
    under the "content" key, and sets the content type to "Text".

    Returns:
        dict: the parent's index dictionary extended with this block's content
    """
    xblock_body = super(HtmlDescriptor, self).index_dictionary()

    # Collapse every run of whitespace and "//" into a single space; this
    # also removes all newlines, so "." below can match across former lines.
    # NOTE(review): the middle alternative is a literal space-like character;
    # confirm it was not meant to be the "&nbsp;" entity.
    html_content = re.sub(r"(\s| |//)+", " ", html_to_text(self.data))

    # Remove CDATA sections. Non-greedy, so text located between two
    # separate sections stays searchable.
    html_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", html_content)

    # Remove HTML comments, each one individually (non-greedy).
    html_content = re.sub(r"<!--.*?-->", "", html_content)

    html_body = {"html_content": html_content, "display_name": self.display_name}

    # Merge into content supplied by the parent class, if there is any.
    if "content" in xblock_body:
        xblock_body["content"].update(html_body)
    else:
        xblock_body["content"] = html_body

    xblock_body["content_type"] = "Text"
    return xblock_body
def __init__(self, *args, **kwargs):
    """
    Set up the module from its XML definition.

    Parses ``self.data``, stores the extracted instructions, stores the
    serialized ``json`` child element after passing it through html_to_text,
    flags whether the current user has an instructor or staff role, and
    records the user's email when it can be looked up.
    """
    super(ImageAnnotationModule, self).__init__(*args, **kwargs)

    root = etree.fromstring(self.data)
    self.instructions = self._extract_instructions(root)

    json_markup = etree.tostring(root.find('json'), encoding='unicode')
    self.openseadragonjson = html_to_text(json_markup)

    # Defaults, overwritten below when the runtime provides better values.
    self.user_email = ""
    self.is_course_staff = False

    role = self.runtime.get_user_role()
    if role in ('instructor', 'staff'):
        self.is_course_staff = True

    if self.runtime.get_real_user is None:
        return

    # Best effort: any failure during the lookup yields a placeholder
    # message instead of aborting construction.
    try:
        real_user = self.runtime.get_real_user(self.runtime.anonymous_student_id)
        self.user_email = real_user.email
    except Exception:  # pylint: disable=broad-except
        self.user_email = _("No email address found.")
def index_dictionary(self):
    """
    Build the search-index entry for this block.

    Starts from the parent class's index dictionary, adds the block's text
    (``self.data`` passed through html_to_text and cleaned of whitespace
    runs, CDATA sections and HTML comments) together with the display name
    under the "content" key, and sets the content type to "Text".

    Returns:
        dict: the parent's index dictionary extended with this block's content
    """
    xblock_body = super(HtmlDescriptor, self).index_dictionary()

    # Collapse every run of whitespace and "//" into a single space; this
    # also removes all newlines, so "." below can match across former lines.
    # NOTE(review): the middle alternative is a literal space-like character;
    # confirm it was not meant to be the "&nbsp;" entity.
    html_content = re.sub(r"(\s| |//)+", " ", html_to_text(self.data))

    # Remove CDATA sections. Non-greedy, so text located between two
    # separate sections stays searchable.
    html_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", html_content)

    # Remove HTML comments, each one individually (non-greedy).
    html_content = re.sub(r"<!--.*?-->", "", html_content)

    html_body = {
        "html_content": html_content,
        "display_name": self.display_name,
    }

    # Merge into content supplied by the parent class, if there is any.
    if "content" in xblock_body:
        xblock_body["content"].update(html_body)
    else:
        xblock_body["content"] = html_body

    xblock_body["content_type"] = "Text"
    return xblock_body
def test_html_to_text(self):
    """html_to_text should reduce the sample markup to its plain text."""
    plain_text = html_to_text(self.sample_html)
    assert "Testing here and not bolded here" == plain_text
def test_html_to_text(self):
    """html_to_text should reduce the sample markup to its plain text."""
    plain_text = html_to_text(self.sample_html)
    self.assertEqual("Testing here and not bolded here", plain_text)