Пример #1
0
def escape_html_characters(content):
    """
    Remove HTML characters that shouldn't be indexed using ElasticSearch indexer.

    This method is complementary to html_to_text method found in xmodule/annotator_mixin.py

    Args:
        content (str): variable to escape html characters from

    Returns:
        content (str): content ready to be indexed by ElasticSearch

    """
    # Collapse runs of whitespace, HTML-encoded non-breaking spaces and "//"
    # into a single space. This must run first: it removes every newline, so
    # the "." in the patterns below can span what used to be line breaks.
    text = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(content))

    # Removing HTML CDATA. The match is non-greedy so that each section is
    # removed on its own; a greedy ".*" would also swallow any real text
    # sitting between the first "<![CDATA[" and the last "]]>".
    text = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text)

    # Removing HTML comments (non-greedy for the same reason as above).
    return re.sub(r"<!--.*?-->", "", text)
Пример #2
0
def strip_html_content_to_text(html_content):
    """
    Get only the textual part of html content - useful for building text to be searched.

    Args:
        html_content (str): markup to reduce to plain text

    Returns:
        str: the output of html_to_text with whitespace collapsed and any
        remaining CDATA sections and HTML comments removed
    """
    # Removing HTML-encoded non-breaking space characters (and whitespace
    # runs / "//"). Every newline becomes a space here, which is what lets
    # "." in the later patterns match across former line breaks.
    text_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content))
    # Removing HTML CDATA. Non-greedy, so text between two separate CDATA
    # sections is kept instead of being deleted along with them.
    text_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text_content)
    # Removing HTML comments. Non-greedy for the same reason.
    text_content = re.sub(r"<!--.*?-->", "", text_content)

    return text_content
    def __init__(self, *args, **kwargs):
        """
        Initialise the module from the XML stored in ``self.data``.

        Stores the extracted instructions, the text of the ``json`` element,
        and the real user's email when the runtime offers ``get_real_user``
        (otherwise ``self.user`` stays an empty string).
        """
        super(ImageAnnotationModule, self).__init__(*args, **kwargs)

        root = etree.fromstring(self.data)
        self.instructions = self._extract_instructions(root)

        # Serialise the <json> element to a unicode string, then strip its markup.
        json_markup = etree.tostring(root.find('json'), encoding='unicode')
        self.openseadragonjson = html_to_text(json_markup)

        self.user = ""
        if self.runtime.get_real_user is None:
            return
        real_user = self.runtime.get_real_user(self.runtime.anonymous_student_id)
        self.user = real_user.email
Пример #4
0
def strip_html_content_to_text(html_content):
    """
    Return the plain-text part of ``html_content`` for use as searchable text.

    The markup is first passed through html_to_text, then normalised in
    three steps: whitespace collapsing, CDATA removal, comment removal.
    """
    # Step 1: whitespace runs, "&nbsp;" entities and "//" become one space.
    # Running this first leaves no newlines, so "." below can span the
    # whole of a formerly multi-line comment or CDATA section.
    collapsed = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content))

    # Steps 2 and 3: drop CDATA sections, then HTML comments. Both patterns
    # are non-greedy; a greedy ".*" would run from the first opener to the
    # LAST closer and silently discard the real text in between.
    without_cdata = re.sub(r"<!\[CDATA\[.*?\]\]>", "", collapsed)
    return re.sub(r"<!--.*?-->", "", without_cdata)
Пример #5
0
 def index_dictionary(self):
     """
     Return the parent class's index dictionary extended with this block's text.

     The text of ``self.data`` and ``self.display_name`` are merged into the
     "content" entry, and "content_type" is set to "Text".
     """
     xblock_body = super(HtmlDescriptor, self).index_dictionary()
     # Removing HTML-encoded non-breaking space characters (plus whitespace
     # runs and "//"); this also removes every newline, so "." in the
     # patterns below can span former line breaks.
     html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(self.data))
     # Removing HTML CDATA. Non-greedy so that text between two CDATA
     # sections is still indexed rather than deleted with them.
     html_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", html_content)
     # Removing HTML comments. Non-greedy for the same reason.
     html_content = re.sub(r"<!--.*?-->", "", html_content)
     html_body = {"html_content": html_content, "display_name": self.display_name}
     if "content" in xblock_body:
         xblock_body["content"].update(html_body)
     else:
         xblock_body["content"] = html_body
     xblock_body["content_type"] = "Text"
     return xblock_body
    def __init__(self, *args, **kwargs):
        """
        Initialise the module from the XML stored in ``self.data``.

        Sets ``instructions``, ``openseadragonjson``, ``is_course_staff`` and
        ``user_email``. If looking up the real user's email raises, a
        translated placeholder message is stored instead.
        """
        super(ImageAnnotationModule, self).__init__(*args, **kwargs)

        root = etree.fromstring(self.data)
        self.instructions = self._extract_instructions(root)

        # Serialise the <json> element to a unicode string, then strip its markup.
        json_markup = etree.tostring(root.find('json'), encoding='unicode')
        self.openseadragonjson = html_to_text(json_markup)

        self.user_email = ""
        # Both 'instructor' and 'staff' roles count as course staff.
        self.is_course_staff = self.runtime.get_user_role() in ('instructor', 'staff')

        if self.runtime.get_real_user is not None:
            try:
                real_user = self.runtime.get_real_user(self.runtime.anonymous_student_id)
                self.user_email = real_user.email
            except Exception:  # pylint: disable=broad-except
                self.user_email = _("No email address found.")
    def __init__(self, *args, **kwargs):
        """
        Build the annotation module's state from ``self.data``.

        Parses the XML, then records the instructions, the text of the
        ``json`` element, whether the user role is staff-level, and the real
        user's email (or a translated fallback message if the lookup fails).
        """
        super(ImageAnnotationModule, self).__init__(*args, **kwargs)

        parsed = etree.fromstring(self.data)
        json_element = parsed.find('json')

        self.instructions = self._extract_instructions(parsed)
        self.openseadragonjson = html_to_text(
            etree.tostring(json_element, encoding='unicode')
        )

        # Defaults, overridden below when the runtime says otherwise.
        self.user_email = ""
        self.is_course_staff = False

        staff_roles = ['instructor', 'staff']
        if self.runtime.get_user_role() in staff_roles:
            self.is_course_staff = True

        if self.runtime.get_real_user is None:
            return

        try:
            self.user_email = self.runtime.get_real_user(
                self.runtime.anonymous_student_id
            ).email
        except Exception:  # pylint: disable=broad-except
            self.user_email = _("No email address found.")
Пример #8
0
 def index_dictionary(self):
     """
     Build the search-index dictionary for this block.

     Starts from the parent class's dictionary, merges the text of
     ``self.data`` and ``self.display_name`` into its "content" entry, and
     sets "content_type" to "Text".
     """
     xblock_body = super(HtmlDescriptor, self).index_dictionary()

     # Collapse whitespace runs, "&nbsp;" entities and "//" into one space.
     # Doing this first strips all newlines, so "." in the patterns below
     # can span former line breaks.
     html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(self.data))
     # Drop CDATA sections, then HTML comments. Both are non-greedy: a
     # greedy ".*" would match from the first opener to the last closer and
     # remove the indexable text lying between two separate sections.
     html_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", html_content)
     html_content = re.sub(r"<!--.*?-->", "", html_content)

     html_body = {
         "html_content": html_content,
         "display_name": self.display_name,
     }
     # Merge into existing content, or create the entry if it is missing.
     xblock_body.setdefault("content", {}).update(html_body)
     xblock_body["content_type"] = "Text"
     return xblock_body
 def test_html_to_text(self):
     """html_to_text applied to the sample HTML yields the expected plain text."""
     expected = "Testing here and not bolded here"
     actual = html_to_text(self.sample_html)
     assert expected == actual
Пример #10
0
 def test_html_to_text(self):
     """html_to_text applied to the sample HTML yields the expected plain text."""
     plain_text = html_to_text(self.sample_html)
     self.assertEqual("Testing here and not bolded here", plain_text)