Python HTMLWasher 예제들, invenio.htmlutils.HTMLWasher Python 예제들

예제 #1

0

파일 보기

def escape_email_quoted_text(text, indent_txt='>>', linebreak_txt='\n'):
    """Escape text using an email-like indenting rule.

    The text is split on C{linebreak_txt}.  Each line is stripped of
    surrounding whitespace, its leading quote markers (C{indent_txt}) are
    counted and removed, the remainder is washed with
    C{HTMLWasher.wash(..., render_unallowed_tags=True)} and the same
    number of quote markers is put back in front of the washed line.

    As an example, this text:

    >>Brave Sir Robin ran away...
    <img src="malicious_script />*No!*
    >>bravely ran away away...
    I didn't!*<script>malicious code</script>
    >>When danger reared its ugly head, he bravely turned his tail and fled.
    <form onload="malicious"></form>*I never did!*

    will be escaped like this:
    >>Brave Sir Robin ran away...
    &lt;img src="malicious_script /&gt;*No!*
    >>bravely ran away away...
    I didn't!*&lt;script&gt;malicious code&lt;/script&gt;
    >>When danger reared its ugly head, he bravely turned his tail and fled.
    &lt;form onload="malicious"&gt;&lt;/form&gt;*I never did!*

    @param text: the quoted text to escape
    @param indent_txt: quote marker that prefixes quoted lines
                       (default: '>>')
    @param linebreak_txt: line separator used both to split the input
                          and to join the output
    @return: the escaped lines joined with C{linebreak_txt}, without a
             trailing separator
    """
    washer = HTMLWasher()
    indent_len = len(indent_txt)
    escaped_lines = []
    for line in text.split(linebreak_txt):
        line = line.strip()
        nb_indent = 0
        # An empty marker matches every string; the guard keeps the loop
        # from spinning forever in that case.
        while indent_txt and line.startswith(indent_txt):
            nb_indent += 1
            line = line[indent_len:]
        escaped_lines.append(
            nb_indent * indent_txt +
            washer.wash(line, render_unallowed_tags=True))
    # Joining (instead of appending the separator and chopping one
    # character off the end) stays correct when linebreak_txt is longer
    # than one character, e.g. CRLF.
    return linebreak_txt.join(escaped_lines)

예제 #2

0

파일 보기

파일: webmessage_mailutils.py 프로젝트: flannery/invenio-flannery

def escape_email_quoted_text(text, indent_txt='>>', linebreak_txt='\n'):
    """Escape text using an email-like indenting rule.

    The input is cut into lines on C{linebreak_txt}.  For every line the
    surrounding whitespace is stripped, the leading quote markers
    (C{indent_txt}) are counted and removed, what is left is washed with
    C{HTMLWasher.wash(..., render_unallowed_tags=True)}, and the counted
    markers are prepended again.

    As an example, this text:

    >>Brave Sir Robin ran away...
    <img src="malicious_script />*No!*
    >>bravely ran away away...
    I didn't!*<script>malicious code</script>
    >>When danger reared its ugly head, he bravely turned his tail and fled.
    <form onload="malicious"></form>*I never did!*

    will be escaped like this:
    >>Brave Sir Robin ran away...
    &lt;img src="malicious_script /&gt;*No!*
    >>bravely ran away away...
    I didn't!*&lt;script&gt;malicious code&lt;/script&gt;
    >>When danger reared its ugly head, he bravely turned his tail and fled.
    &lt;form onload="malicious"&gt;&lt;/form&gt;*I never did!*

    @param text: the quoted text to escape
    @param indent_txt: quote marker in front of quoted lines
                       (default: '>>')
    @param linebreak_txt: separator used to split the input and to join
                          the escaped lines again
    @return: the escaped text; no separator is appended after the last
             line
    """
    washer = HTMLWasher()
    marker_len = len(indent_txt)
    washed_lines = []
    for line in text.split(linebreak_txt):
        line = line.strip()
        nb_indent = 0
        # Guard against an empty marker: startswith('') is always true
        # and would otherwise never let the loop finish.
        while indent_txt and line.startswith(indent_txt):
            nb_indent += 1
            line = line[marker_len:]
        washed = washer.wash(line, render_unallowed_tags=True)
        washed_lines.append(nb_indent * indent_txt + washed)
    # str.join puts the separator only between lines, so nothing has to
    # be trimmed afterwards; trimming a single character was wrong for
    # separators longer than one character.
    return linebreak_txt.join(washed_lines)

예제 #3

0

파일 보기

파일: htmlutils_unit_tests.py 프로젝트: chezjohnny/invenio

class XSSEscapingTest(unittest.TestCase):
    """Test functions related to the prevention of XSS attacks.

    Each test hands a hostile HTML snippet to a shared HTMLWasher and
    compares the washed result with the expected harmless markup.
    """

    def __init__(self, methodName='test'):
        # The washer is created before delegating to the base class
        # initialiser, exactly one instance per test case object.
        self.washer = HTMLWasher()
        unittest.TestCase.__init__(self, methodName)

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        closing_tags = """</html></body></pre>"""
        # Default mode: the tags are dropped altogether.
        dropped = self.washer.wash(html_buffer=closing_tags)
        self.assertEqual(dropped, '')
        # Rendering mode: the tags survive as escaped text.
        escaped = self.washer.wash(html_buffer=closing_tags,
                                   render_unallowed_tags=True)
        self.assertEqual(escaped, '&lt;/html&gt;&lt;/body&gt;&lt;/pre&gt;')

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        script_block = """<script>malicious_function();</script>"""
        # Default mode: the whole script element disappears.
        dropped = self.washer.wash(html_buffer=script_block)
        self.assertEqual(dropped, '')
        # Rendering mode: tags and content are shown as escaped text.
        escaped = self.washer.wash(html_buffer=script_block,
                                   render_unallowed_tags=True)
        self.assertEqual(
            escaped,
            '&lt;script&gt;malicious_function();&lt;/script&gt;')

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        bare_paragraph = '<p>'
        # onload
        with_onload = """<p onload="javascript:malicious_functtion();">"""
        self.assertEqual(self.washer.wash(html_buffer=with_onload),
                         bare_paragraph)
        # tricky: css calling a javascript
        with_style = """<p style="background: url('http://malicious_site.com/malicious_script.js');">"""
        self.assertEqual(self.washer.wash(html_buffer=with_style),
                         bare_paragraph)

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        emptied_link = '<a href="">link</a>'
        hostile_links = (
            # Plain javascript: pseudo-URL
            """<a href="javascript:malicious_function();">link</a>""",
            # Pirates could encode ascii values, or use uppercase letters...
            """<a href="&#106;a&#118;asCRi&#112;t:malicious_function();">link</a>""",
            # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
            # Here we test with:
            # j
            #     avas
            #   crIPt :
            """<a href="&#106;\n    a&#118;as\n  crI&#80;t :malicious_function();">link</a>""",
        )
        for hostile_link in hostile_links:
            self.assertEqual(self.washer.wash(html_buffer=hostile_link),
                             emptied_link)

예제 #4

0

파일 보기

class XSSEscapingTest(InvenioTestCase):
    """Test functions related to the prevention of XSS attacks.

    Each test hands a hostile HTML snippet to a shared HTMLWasher and
    compares the washed result with the expected harmless markup.
    """

    def __init__(self, methodName='test'):
        # The washer is created before delegating to the base class
        # initialiser, exactly one instance per test case object.
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        closing_tags = """</html></body></pre>"""
        # Default mode: the tags are dropped altogether.
        dropped = self.washer.wash(html_buffer=closing_tags)
        self.assertEqual(dropped, '')
        # Rendering mode: the tags survive as escaped text.
        escaped = self.washer.wash(html_buffer=closing_tags,
                                   render_unallowed_tags=True)
        self.assertEqual(escaped, '&lt;/html&gt;&lt;/body&gt;&lt;/pre&gt;')

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        script_block = """<script>malicious_function();</script>"""
        # Default mode: the whole script element disappears.
        dropped = self.washer.wash(html_buffer=script_block)
        self.assertEqual(dropped, '')
        # Rendering mode: tags and content are shown as escaped text.
        escaped = self.washer.wash(html_buffer=script_block,
                                   render_unallowed_tags=True)
        self.assertEqual(
            escaped,
            '&lt;script&gt;malicious_function();&lt;/script&gt;')

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        bare_paragraph = '<p>'
        # onload
        with_onload = """<p onload="javascript:malicious_functtion();">"""
        self.assertEqual(self.washer.wash(html_buffer=with_onload),
                         bare_paragraph)
        # tricky: css calling a javascript
        with_style = """<p style="background: url('http://malicious_site.com/malicious_script.js');">"""
        self.assertEqual(self.washer.wash(html_buffer=with_style),
                         bare_paragraph)

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        emptied_link = '<a href="">link</a>'
        hostile_links = (
            # Plain javascript: pseudo-URL
            """<a href="javascript:malicious_function();">link</a>""",
            # Pirates could encode ascii values, or use uppercase letters...
            """<a href="&#106;a&#118;asCRi&#112;t:malicious_function();">link</a>""",
            # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
            # Here we test with:
            # j
            #     avas
            #   crIPt :
            """<a href="&#106;\n    a&#118;as\n  crI&#80;t :malicious_function();">link</a>""",
        )
        for hostile_link in hostile_links:
            self.assertEqual(self.washer.wash(html_buffer=hostile_link),
                             emptied_link)

예제 #5

0

파일 보기

class HTMLAutomaticLinksTransformation(InvenioTestCase):
    """Test functions related to transforming links into HTML context.

    All tests call HTMLWasher.wash() with
    automatic_link_transformation=True and compare the result with the
    expected markup.
    """

    def __init__(self, methodName='test'):
        # One washer per test case object, created before the base class
        # initialiser runs.
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        body_input = 'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>'
        self.assertEqual(
            self.washer.wash(html_buffer=body_input,
                             automatic_link_transformation=True),
            body_expected)

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds.cern.ch/search?p=%27CERN+News'
        # The expected second link stops before '+News'.
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text <a href="https://cds.cern.ch/search?p=%27CERN">https://cds.cern.ch/search?p=%27CERN</a>+News'
        self.assertEqual(
            self.washer.wash(html_buffer=body_input,
                             automatic_link_transformation=True),
            body_expected)

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        # The second URL ('cds..cern') is expected to stay plain text.
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds..cern/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text https://cds..cern/search?p=%27CERN+News'
        self.assertEqual(
            self.washer.wash(html_buffer=body_input,
                             automatic_link_transformation=True),
            body_expected)

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        body_input = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        # Markup that is already an anchor must come back unchanged.
        body_expected = body_input
        self.assertEqual(
            self.washer.wash(html_buffer=body_input,
                             automatic_link_transformation=True),
            body_expected)

예제 #6

0

파일 보기

파일: htmlutils_unit_tests.py 프로젝트: chezjohnny/invenio

class HTMLAutomaticLinksTransformation(unittest.TestCase):
    """Test functions related to transforming links into HTML context.

    All tests call HTMLWasher.wash() with
    automatic_link_transformation=True and compare the result with the
    expected markup.
    """

    def __init__(self, methodName='test'):
        # One washer per test case object, created before the base class
        # initialiser runs.
        self.washer = HTMLWasher()
        unittest.TestCase.__init__(self, methodName)

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        body_input = 'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds.cern.ch/search?p=%27CERN+News'
        # The expected second link stops before '+News'.
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text <a href="https://cds.cern.ch/search?p=%27CERN">https://cds.cern.ch/search?p=%27CERN</a>+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        # The second URL ('cds..cern') is expected to stay plain text.
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds..cern/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text https://cds..cern/search?p=%27CERN+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        body_input = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        # Markup that is already an anchor must come back unchanged.
        body_expected = body_input
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

예제 #7

0

파일 보기

파일: bfe_notes.py 프로젝트: GRArmstrong/inspire-old

def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (various note fields)

    The values of field 500__a are washed with automatic link
    transformation enabled, joined with C{separator} into one group, and
    the group is wrapped in C{note_prefix} / C{note_suffix}.

    @param bfo: object whose C{fields()} method supplies the field values
    @param note_prefix: a prefix before each group of notes
    @param note_suffix: a suffix after each group of notes
    @param separator: a separator between notes of a group
    @return: the formatted notes, or None when field 500__a has no values
    """
    washer = HTMLWasher()
    groups = []

    # Get values from certain fields, wash them (so all links become
    # clickable), join using separator and add to the list of groups
    if bfo.fields('500__a'):
        washed_items = [
            washer.wash(value, automatic_link_transformation=True)
            for value in bfo.fields('500__a')
        ]
        groups.append(separator.join(washed_items))

    if not groups:
        return None

    # Wrap every group in its prefix and suffix, then glue them together
    decorated = []
    for group in groups:
        decorated.append(note_prefix + group + note_suffix)
    return "".join(decorated)

예제 #8

0

파일 보기

def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (various note fields)

    Field 500__a is read from C{bfo}; every value is washed with
    automatic link transformation enabled, the washed values are joined
    with C{separator}, and the resulting group is surrounded by
    C{note_prefix} and C{note_suffix}.

    @param bfo: object whose C{fields()} method supplies the field values
    @param note_prefix: a prefix before each group of notes
    @param note_suffix: a suffix after each group of notes
    @param separator: a separator between notes of a group
    @return: the formatted notes, or None when field 500__a has no values
    """
    washer = HTMLWasher()
    note_groups = []

    # Get values from certain fields, wash them (so all links become
    # clickable), join using separator and add to the list of groups
    if bfo.fields('500__a'):
        clickable = []
        for raw_note in bfo.fields('500__a'):
            clickable.append(
                washer.wash(raw_note, automatic_link_transformation=True))
        note_groups.append(separator.join(clickable))

    if not note_groups:
        return None

    # Add the prefix and suffix around each group before concatenating
    return "".join(
        [note_prefix + group + note_suffix for group in note_groups])

예제 #9

0

파일 보기

파일: bfe_notes.py 프로젝트: turtle321/inspire

def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (various note fields)

    Every value of field 500__a has its ampersands replaced by the
    C{&amp;} entity, is washed with automatic link transformation
    enabled, and is then wrapped individually in C{note_prefix} and
    C{note_suffix}.  The wrapped notes are concatenated directly.

    @param bfo: object whose C{fields()} method supplies the field values
    @param note_prefix: a prefix before each note
    @param note_suffix: a suffix after each note
    @param separator: accepted for interface compatibility; this
                      implementation never reads it
    @return: the formatted notes, or None when field 500__a has no values
    """
    washer = HTMLWasher()

    # Get values from certain fields and wash them (so all links become
    # clickable); ampersands are turned into entities before washing
    washed_notes = [
        washer.wash(raw_note.replace("&", "&amp;"),
                    automatic_link_transformation=True)
        for raw_note in bfo.fields('500__a')
    ]

    if not washed_notes:
        return None

    # Add the prefix and suffix around each note before concatenating
    wrapped = []
    for note in washed_notes:
        wrapped.append(note_prefix + note + note_suffix)
    return "".join(wrapped)

예제 #10

0

파일 보기

class HTMLWashingTest(InvenioTestCase):
    """Test functions related to general washing of HTML source.

    The single test runs a series of snippets through a shared
    HTMLWasher and compares each washed result with the expected markup.
    """

    def __init__(self, methodName='test'):
        # The washer is created before delegating to the base class
        # initialiser, exactly one instance per test case object.
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""

        blink_markup = 'Spam and <b><blink>eggs</blink></b>'

        # Simple test case
        self.assertEqual(self.washer.wash(html_buffer=blink_markup),
                         'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        self.assertEqual(
            self.washer.wash(html_buffer=blink_markup,
                             render_unallowed_tags=True),
            'Spam and <b>&lt;blink&gt;eggs&lt;/blink&gt;</b>')

        # Remaining cases all use the default washing mode:
        # (input, expected output)
        default_mode_cases = (
            # Keep entity and character references
            ('<b> a &lt; b &gt; c </b> &#247;',
             '<b> a &lt; b &gt; c </b> &#247;'),
            # Remove content of <script> tags
            ('<script type="text/javacript">alert("foo")</script>bar',
             'bar'),
            ('<script type="text/javacript"><!--alert("foo")--></script>bar',
             'bar'),
            # Remove content of <style> tags
            ('<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>',
             'styled text'),
            ('<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>',
             'styled text'),
        )
        for dirty, clean in default_mode_cases:
            self.assertEqual(self.washer.wash(html_buffer=dirty), clean)

예제 #11

0

파일 보기

파일: htmlutils_unit_tests.py 프로젝트: chezjohnny/invenio

class HTMLWashingTest(unittest.TestCase):
    """Test functions related to general washing of HTML source.

    The single test runs a series of snippets through a shared
    HTMLWasher and compares each washed result with the expected markup.
    """

    def __init__(self, methodName='test'):
        # The washer is created before delegating to the base class
        # initialiser, exactly one instance per test case object.
        self.washer = HTMLWasher()
        unittest.TestCase.__init__(self, methodName)

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""

        blink_markup = 'Spam and <b><blink>eggs</blink></b>'

        # Simple test case
        self.assertEqual(self.washer.wash(html_buffer=blink_markup),
                         'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        self.assertEqual(
            self.washer.wash(html_buffer=blink_markup,
                             render_unallowed_tags=True),
            'Spam and <b>&lt;blink&gt;eggs&lt;/blink&gt;</b>')

        # Remaining cases all use the default washing mode:
        # (input, expected output)
        default_mode_cases = (
            # Keep entity and character references
            ('<b> a &lt; b &gt; c </b> &#247;',
             '<b> a &lt; b &gt; c </b> &#247;'),
            # Remove content of <script> tags
            ('<script type="text/javacript">alert("foo")</script>bar',
             'bar'),
            ('<script type="text/javacript"><!--alert("foo")--></script>bar',
             'bar'),
            # Remove content of <style> tags
            ('<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>',
             'styled text'),
            ('<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>',
             'styled text'),
        )
        for dirty, clean in default_mode_cases:
            self.assertEqual(self.washer.wash(html_buffer=dirty), clean)

예제 #12

0

파일 보기

파일: htmlutils_unit_tests.py 프로젝트: chezjohnny/invenio

 def __init__(self, methodName='test'):
     """Create the washer used by the tests, then run the base initialiser.

     @param methodName: name of the test method, passed unchanged to
                        unittest.TestCase.__init__
     """
     # The washer is stored on the instance before the base class is
     # initialised.
     self.washer = HTMLWasher()
     unittest.TestCase.__init__(self, methodName)

예제 #13

0

파일 보기

파일: webmessage_mailutils.py 프로젝트: chokribr/inveniotest

def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>',
                          indent_block=True):
    """
    Takes a typical mail quoted text, e.g.::
        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!

    and return an html formatted output, e.g.::
        hello,<br/>
        you told me:<br/>
        <div>
          Your mother was a hamster and your father smelt of elderberries
        </div>
        I must tell you that I'm not convinced. Then in this discussion:
        <div>
          <div>
            Is there someone else up there we could talk to?
          </div>
          No. Now, go away, or I shall taunt you a second time-a!
        </div>
        I think we're not going to be friends!

    The behaviour is different when C{indent_block} is C{True} or C{False}.
    When C{True}, C{indent_html} is only added at each change of level of
    indentation, while it is added for each line when C{False}.
    For eg::
        >> a
        >> b
        >>>> c

    would result in (if C{True})::
        <div class="commentbox">
            a<br/>
            b<br/>
            <div class="commentbox">
                c<br/>
            </div>
        </div>

    or would be (if C{False})::
        <div class="commentbox"> a</div><br/>
        <div class="commentbox"> b</div><br/>
        <div class="commentbox"><div class="commentbox"> c</div></div><br/>

    Each line is washed with C{HTMLWasher.wash()} once its quote markers
    have been removed; if washing raises C{HTMLParseError} the line is
    escaped with C{cgi.escape()} instead.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>')
    @param linebreak_txt: line separator in email
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @param indent_block: if indentation should be done per 'block'
                         i.e. only at changes of indentation level
                         (+1, -1) or at each line.
    @return: string containing html formatted output
    """
    washer = HTMLWasher()

    # Output fragments are collected and joined once at the end.
    parts = []
    nb_indent = 0
    indent_len = len(indent_txt)
    text = text.strip('\n')
    for line in text.split(linebreak_txt):
        # Count and strip the quote markers in front of the line.  The
        # guard on indent_txt avoids an endless loop with an empty marker.
        new_nb_indent = 0
        while indent_txt and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[indent_len:]

        if indent_block:
            if new_nb_indent > nb_indent:
                # Deeper quoting: open one block per additional level.
                for dummy in range(nb_indent, new_nb_indent):
                    parts.append(tabs_before * "\t" + indent_html[0] + "\n")
                    tabs_before += 1
            elif new_nb_indent < nb_indent:
                # Shallower quoting: close one block per level left.
                for dummy in range(new_nb_indent, nb_indent):
                    tabs_before -= 1
                    parts.append(tabs_before * "\t" + indent_html[1] + "\n")
            else:
                # Same level: this tab prefix comes in addition to the one
                # added before the line below; kept to preserve the
                # existing output.
                parts.append(tabs_before * "\t")
        else:
            parts.append(tabs_before * "\t" + new_nb_indent * indent_html[0])

        try:
            line = washer.wash(line)
        except HTMLParseError:
            # Line contained something like "foo<bar"
            line = cgi.escape(line)

        if indent_block:
            parts.append(tabs_before * "\t")
        parts.append(line)
        if not indent_block:
            parts.append(new_nb_indent * indent_html[1])
        parts.append(linebreak_html + "\n")
        nb_indent = new_nb_indent

    if indent_block:
        # Close the blocks still open after the last line, using the
        # caller-supplied closing tag rather than a hard-coded "</div>".
        for dummy in range(nb_indent):
            tabs_before -= 1
            parts.append(tabs_before * "\t" + indent_html[1] + "\n")
    return "".join(parts)

예제 #14

0

파일 보기

파일: webstyle_templates_scoap3.py 프로젝트: jalavik/scoap3

    def tmpl_pageheader(self,
                        req,
                        ln=CFG_SITE_LANG,
                        headertitle="",
                        description="",
                        keywords="",
                        userinfobox="",
                        useractivities_menu="",
                        adminactivities_menu="",
                        navtrailbox="",
                        pageheaderadd="",
                        uid=0,
                        secure_page_p=0,
                        navmenuid="admin",
                        metaheaderadd="",
                        rssurl=CFG_BASE_URL + "/rss",
                        body_css_classes=None):
        """Creates a page header

           Parameters:

          - 'ln' *string* - The language to display

          - 'headertitle' *string* - the title of the HTML page, not yet escaped for HTML

          - 'description' *string* - description goes to the metadata in the header of the HTML page,
                                     not yet escaped for HTML

          - 'keywords' *string* - keywords goes to the metadata in the header of the HTML page,
                                  not yet escaped for HTML

          - 'userinfobox' *string* - the HTML code for the user information box

          - 'useractivities_menu' *string* - the HTML code for the user activities menu

          - 'adminactivities_menu' *string* - the HTML code for the admin activities menu

          - 'navtrailbox' *string* - the HTML code for the navigation trail box

          - 'pageheaderadd' *string* - additional page header HTML code

          - 'uid' *int* - user ID

          - 'secure_page_p' *int* (0 or 1) - are we to use HTTPS friendly page elements or not?

          - 'navmenuid' *string* - the id of the navigation item to highlight for this page

          - 'metaheaderadd' *string* - list of further tags to add to the <HEAD></HEAD> part of the page

          - 'rssurl' *string* - the url of the RSS feed for this page

          - 'body_css_classes' *list* - list of classes to add to the body tag

           Output:

          - HTML code of the page headers
        """
        # Including HEPData headers ( Ugly hack but no obvious way to avoid this ...)
        if CFG_INSPIRE_SITE:
            hepDataAdditions = """<script type="text/javascript" src="%s/js/hepdata.js"></script>""" \
            % (CFG_BASE_URL, )
            hepDataAdditions += """<link rel="stylesheet" href="%s/img/hepdata.css" type="text/css" />""" \
            % (CFG_BASE_URL, )
        else:
            hepDataAdditions = ""
        # load the right message language
        _ = gettext_set_language(ln)

        if body_css_classes is None:
            body_css_classes = []
        body_css_classes.append(navmenuid)

        uri = req.unparsed_uri
        headerLinkbackTrackbackLink = ''
        if CFG_WEBLINKBACK_TRACKBACK_ENABLED:
            from invenio.weblinkback_templates import get_trackback_auto_discovery_tag
            # Embed a link in the header to subscribe trackbacks
            # TODO: This hack must be replaced with the introduction of the new web framework
            recordIndexInURI = uri.find('/' + CFG_SITE_RECORD + '/')
            # substring found --> offer trackback link in header
            if recordIndexInURI != -1:
                recid = uri[recordIndexInURI:len(uri)].split('/')[2].split(
                    "?")[0]  #recid might end with ? for journal records
                headerLinkbackTrackbackLink = get_trackback_auto_discovery_tag(
                    recid)

        if CFG_WEBSTYLE_INSPECT_TEMPLATES:
            inspect_templates_message = '''
<table width="100%%" cellspacing="0" cellpadding="2" border="0">
<tr bgcolor="#aa0000">
<td width="100%%">
<font color="#ffffff">
<strong>
<small>
CFG_WEBSTYLE_INSPECT_TEMPLATES debugging mode is enabled.  Please
hover your mouse pointer over any region on the page to see which
template function generated it.
</small>
</strong>
</font>
</td>
</tr>
</table>
'''
        else:
            inspect_templates_message = ""

        sitename = CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME)
        if headertitle == sitename:
            pageheadertitle = headertitle
        else:
            pageheadertitle = headertitle + ' - ' + sitename

        metabase = ""
        stripped_url = CFG_SITE_URL.replace("://", "")
        if not CFG_BASE_URL and '/' in stripped_url:
            metabase = "<base href='%s'>" % (CFG_SITE_URL, )

        out = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="%(ln_iso_639_a)s" xml:lang="%(ln_iso_639_a)s" xmlns:og="http://opengraphprotocol.org/schema/" >
<head>
 <title>%(pageheadertitle)s</title>
 %(metabase)s
 <link rev="made" href="mailto:%(sitesupportemail)s" />
 <link rel="stylesheet" href="%(cssurl)s/img/invenio%(cssskin)s.css" type="text/css" />
 <!--[if lt IE 8]>
    <link rel="stylesheet" type="text/css" href="%(cssurl)s/img/invenio%(cssskin)s-ie7.css" />
 <![endif]-->
 <!--[if gt IE 8]>
    <style type="text/css">div.restrictedflag {filter:none;}</style>
 <![endif]-->
 %(canonical_and_alternate_urls)s
 <!-- <link rel="alternate" type="application/rss+xml" title="%(sitename)s RSS" href="%(rssurl)s" /> -->
 <link rel="search" type="application/opensearchdescription+xml" href="%(siteurl)s/opensearchdescription" title="%(sitename)s" />
 <link rel="unapi-server" type="application/xml" title="unAPI" href="%(unAPIurl)s" />
 <link rel="icon" href="/img/favicon.ico" type="image/x-icon">
 <link rel="shortcut icon" href="/img/favicon.ico" type="image/x-icon">
 %(linkbackTrackbackLink)s
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 <meta http-equiv="Content-Language" content="%(ln)s" />
 <meta name="description" content="%(description)s" />
 <meta name="keywords" content="%(keywords)s" />
 <meta name="google-site-verification" content="mLqufkdPNxUHXFW4obCfN5NJXr4sD_SlnvsOla7RZAE" />
 <meta name="msvalidate.01" content="EA9805F0F62E4FF22B98853713964B28" />
 <script type="text/javascript" src="%(cssurl)s/js/jquery.min.js"></script>
 %(hepDataAdditions)s
 %(metaheaderadd)s
</head>
<body%(body_css_classes)s lang="%(ln_iso_639_a)s"%(rtl_direction)s>
<div class="pageheader">
%(inspect_templates_message)s
<!-- replaced page header -->
<div class="headerlogo">
<table class="headerbox" cellspacing="0">
 <tr>
  <td align="right" valign="top" colspan="12">
  <div class="headerboxbodylogo">
   <a href="%(cssurl)s/?ln=%(ln)s">SCOAP<sup>3</sup> Repository (&beta;eta)</a>
  </div>
  </td>
 </tr>
 <tr style="background-color: #679A70;">
  <td style="padding: 10px; font-size: medium; color: #FFF;"><a href="/" style="text-decoration: none; color: #FFF;">HOME</a> :: <a href="http://scoap3.org/" style="text-decoration: none; color: #FFF;">SCOAP<sup>3</sup></a> :: <a href="http://scoap3.org/scoap3-repository-help" style="text-decoration: none; color: #FFF;">HELP</a> :: <a href="http://scoap3.org/scoap3-repository" style="text-decoration: none; color: #FFF;">ABOUT</a></td>
 </tr>
 <!--
 <tr class="menu">
       <td class="headermoduleboxbodyblank">
             &nbsp;
       </td>
       <td class="headermoduleboxbodyblank">
             &nbsp;
       </td>
       <td class="headermoduleboxbody%(search_selected)s">
             <a class="header%(search_selected)s" href="%(cssurl)s/?ln=%(ln)s">%(msg_search)s</a>
       </td>
       <td class="headermoduleboxbodyblank">
             &nbsp;
       </td>
       <td class="headermoduleboxbody%(submit_selected)s">
             <a class="header%(submit_selected)s" href="%(cssurl)s/submit?ln=%(ln)s">%(msg_submit)s</a>
       </td>
       <td class="headermoduleboxbodyblank">
             &nbsp;
       </td>
       <td class="headermoduleboxbody%(personalize_selected)s">
             %(useractivities)s
       </td>
       <td class="headermoduleboxbodyblank">
             &nbsp;
       </td>
       <td class="headermoduleboxbody%(help_selected)s">
             <a class="header%(help_selected)s" href="%(cssurl)s/help/%(langlink)s">%(msg_help)s</a>
       </td>
       %(adminactivities)s
       <td class="headermoduleboxbodyblanklast">
             &nbsp;
       </td>
 </tr>-->
</table>
</div>
<table class="navtrailbox">
 <tr>
  <td class="navtrailboxbody">
   %(navtrailbox)s
  </td>
 </tr>
</table>
<!-- end replaced page header -->
%(pageheaderadd)s
</div>
        """ % {
          'metabase': metabase,
          'rtl_direction': is_language_rtl(ln) and ' dir="rtl"' or '',
          'siteurl': CFG_SITE_URL,
          'sitesecureurl': CFG_SITE_SECURE_URL,
          'canonical_and_alternate_urls': self.tmpl_canonical_and_alternate_urls(uri),
          'cssurl': CFG_BASE_URL,
          'cssskin': CFG_WEBSTYLE_TEMPLATE_SKIN != 'default' and '_' + CFG_WEBSTYLE_TEMPLATE_SKIN or '',
          'rssurl': rssurl,
          'ln': ln,
          'ln_iso_639_a': ln.split('_', 1)[0],
          'langlink': '?ln=' + ln,

          'sitename': CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME),
          'pageheadertitle': HTMLWasher().wash(pageheadertitle),

          'sitesupportemail': CFG_SITE_SUPPORT_EMAIL,

          'description': cgi.escape(description, True),
          'keywords': cgi.escape(keywords, True),
          'metaheaderadd': metaheaderadd,

          'userinfobox': userinfobox,
          'navtrailbox': navtrailbox,
          'useractivities': useractivities_menu,
          'adminactivities': adminactivities_menu and ('<td class="headermoduleboxbodyblank">&nbsp;</td><td class="headermoduleboxbody%(personalize_selected)s">%(adminactivities)s</td>' % \
          {'personalize_selected': navmenuid.startswith('admin') and "selected" or "",
          'adminactivities': adminactivities_menu}) or '<td class="headermoduleboxbodyblank">&nbsp;</td>',

          'pageheaderadd': pageheaderadd,
          'body_css_classes': body_css_classes and ' class="%s"' % ' '.join(body_css_classes) or '',

          'search_selected': navmenuid == 'search' and "selected" or "",
          'submit_selected': navmenuid == 'submit' and "selected" or "",
          'personalize_selected': navmenuid.startswith('your') and "selected" or "",
          'help_selected': navmenuid == 'help' and "selected" or "",

          'msg_search': _("Search"),
          'msg_submit': _("Submit"),
          'msg_personalize': _("Personalize"),
          'msg_help': _("Help"),
          'unAPIurl': cgi.escape('%s/unapi' % CFG_SITE_URL),
          'linkbackTrackbackLink': headerLinkbackTrackbackLink,
          'hepDataAdditions': hepDataAdditions,
          'inspect_templates_message': inspect_templates_message
        }
        return out

예제 #15

0

파일 보기

def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>'):
    """
    Convert email-style quoted text into nested HTML blocks.

    Takes a typical mail quoted text, e.g.:
        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!
    and return an html formatted output, e.g. (with the default arguments;
    leading tabs are not shown):
        hello,<br/>
        you told me:<br/>
        <div class="commentbox">
          Your mother was a hamster and your father smelt of elderberries<br/>
        </div>
        I must tell you that I'm not convinced. Then in this discussion:<br/>
        <div class="commentbox">
          <div class="commentbox">
            Is there someone else up there we could talk to?<br/>
          </div>
          No. Now, go away, or I shall taunt you a second time-a!<br/>
        </div>
        I think we're not going to be friends!<br/>

    Each line is passed through HTMLWasher.wash() after its quote markers
    have been removed.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>'). An empty
                       separator means that no line is considered quoted.
    @param linebreak_txt: line separator in email (default: newline)
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    opening_tag, closing_tag = indent_html[0], indent_html[1]

    # Pieces of the output; joined once at the end instead of growing a
    # string with repeated "+=".
    chunks = []
    nb_indent = 0
    for line in text.strip('\n').split(linebreak_txt):
        # Count and strip the leading quote markers. The "indent_txt and"
        # guard avoids an endless loop: every string starts with ''.
        new_nb_indent = 0
        while indent_txt and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[len(indent_txt):]

        if new_nb_indent > nb_indent:
            # Deeper than the previous line: open one block per new level.
            for dummy in range(nb_indent, new_nb_indent):
                chunks.append(tabs_before * "\t" + opening_tag + "\n")
                tabs_before += 1
        elif new_nb_indent < nb_indent:
            # Shallower than the previous line: close the extra levels.
            for dummy in range(new_nb_indent, nb_indent):
                tabs_before -= 1
                chunks.append(tabs_before * "\t" + closing_tag + "\n")
        else:
            # Same level: only indentation is emitted here. The line gets
            # its own tab prefix below as well, so such lines carry the
            # tabs twice; kept as is so existing output does not change.
            chunks.append(tabs_before * "\t")

        chunks.append(tabs_before * "\t" + washer.wash(line))
        chunks.append(linebreak_html + "\n")
        nb_indent = new_nb_indent

    # Close the blocks still open after the last line, using the caller's
    # closing tag so that a custom indent_html stays balanced.
    for dummy in range(nb_indent):
        tabs_before -= 1
        chunks.append(tabs_before * "\t" + closing_tag + "\n")
    return "".join(chunks)

예제 #16

0

파일 보기

파일: webmessage_mailutils.py 프로젝트: flannery/invenio-flannery

def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>'):
    """
    Turn mail-style quoted text into HTML with one nested block per
    quoting level.

    Takes a typical mail quoted text, e.g.:
        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!
    and return an html formatted output, e.g. (default arguments, leading
    tabs left out):
        hello,<br/>
        you told me:<br/>
        <div class="commentbox">
          Your mother was a hamster and your father smelt of elderberries<br/>
        </div>
        I must tell you that I'm not convinced. Then in this discussion:<br/>
        <div class="commentbox">
          <div class="commentbox">
            Is there someone else up there we could talk to?<br/>
          </div>
          No. Now, go away, or I shall taunt you a second time-a!<br/>
        </div>
        I think we're not going to be friends!<br/>

    The content of every line (quote markers removed) goes through
    HTMLWasher.wash() before being written out.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>'); when empty,
                       no line is treated as quoted
    @param linebreak_txt: line separator in email (default: newline)
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    open_tag = indent_html[0]
    close_tag = indent_html[1]

    pieces = []          # output fragments, joined once at the end
    nb_indent = 0        # quoting depth of the previously handled line
    text = text.strip('\n')
    for line in text.split(linebreak_txt):
        new_nb_indent = 0
        # An empty marker would match forever, hence the extra check.
        while indent_txt and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[len(indent_txt):]

        if new_nb_indent > nb_indent:
            # Entering deeper quoting: open the missing blocks.
            for dummy in range(nb_indent, new_nb_indent):
                pieces.append(tabs_before * "\t" + open_tag + "\n")
                tabs_before += 1
        elif new_nb_indent < nb_indent:
            # Leaving quoting levels: close them, innermost first.
            for dummy in range(new_nb_indent, nb_indent):
                tabs_before -= 1
                pieces.append(tabs_before * "\t" + close_tag + "\n")
        else:
            # Unchanged depth: emits indentation only. Together with the
            # prefix added below this doubles the tabs for such lines,
            # which is preserved to keep the generated markup unchanged.
            pieces.append(tabs_before * "\t")

        line = washer.wash(line)
        pieces.append(tabs_before * "\t" + line)
        pieces.append(linebreak_html + "\n")
        nb_indent = new_nb_indent

    # Blocks still open at the end of the text are closed with the closing
    # tag supplied by the caller (previously a literal "</div>").
    for dummy in range(nb_indent):
        tabs_before -= 1
        pieces.append(tabs_before * "\t" + close_tag + "\n")
    return "".join(pieces)

예제 #17

0

파일 보기

파일: bfe_webjournal_articles_overview.py 프로젝트: ppiotr/Invenio

def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.

    Candidates are tried in this order; the first usable one is returned:
      1. the header field ('...__a'), washed with empty whitelists;
      2. the "header" group matched in the article body by header_pattern
         (or header_pattern2), washed keeping only <a> tags with the
         href/target/class attributes;
      3. the "paragraph" group matched by para_pattern in the body (with
         the header and empty paragraphs removed), washed with empty
         whitelists; when longer than 250 characters it is passed to
         _get_first_sentence_or_part();
      4. _get_first_sentence_or_part() applied to the whole washed body.

    @param record: object giving access to the fields through
                   field(tag) and fields(tag)
    @param language: "fr" reads the 590 fields first and falls back on the
                     520 ones; any other value does the opposite
    @return: the text to feature, or '' when there is neither a header nor
             an article body
    """
    # Field contents that are treated as "nothing was entered". The two
    # sets differ on purpose ('<br/>' vs '<br />'), as in the original code.
    empty_header_values = ('', '<br/>', '<!--HTML--><br />', '<!--HTML-->')
    empty_body_values = ('', '<br />', '<!--HTML--><br />', '<!--HTML-->')

    if language == "fr":
        preferred, fallback = '590', '520'
    else:
        preferred, fallback = '520', '590'

    washer = HTMLWasher()

    # 1. Check if there is a header
    header = record.field(preferred + '__a')
    if header.strip() in empty_header_values:
        header = record.field(fallback + '__a')
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        return header

    # No header: fall back on the article body.
    article = record.fields(preferred + '__b')
    if not article or \
           (len(article) == 1 and article[0].strip() in empty_body_values):
        article = record.fields(fallback + '__b')
    if not article:
        return ''
    article = article[0]

    header_text = ""
    match_obj = re.search(header_pattern, article)
    if not match_obj:
        match_obj = re.search(header_pattern2, article)
    try:
        # 2. Header embedded in the body. A missing match (match_obj is
        # None), a washing failure or an empty result all lead to the
        # fallback branch below.
        header_text = match_obj.group("header")
        header_text = washer.wash(html_buffer=header_text,
                                  allowed_tag_whitelist=['a'],
                                  allowed_attribute_whitelist=['href',
                                                               'target',
                                                               'class'])
        if header_text == "":
            raise Exception
    except Exception:
        # header_text is either '' (replace is then a no-op) or the raw
        # header that could not be washed.
        article = article.replace(header_text, '')
        article = article.replace('<p/>', '')
        article = article.replace('<p>&nbsp;</p>', '')
        match_obj = re.search(para_pattern, article)
        try:
            # 3. get the first paragraph
            header_text = match_obj.group("paragraph")
            try:
                header_text = washer.wash(html_buffer=header_text,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
            except Exception:
                # was not able to parse correctly the HTML. Use
                # this safer function, but producing less good
                # results
                header_text = remove_html_markup(header_text)

            if header_text.strip() == "":
                raise Exception
            if len(header_text) > 250:
                header_text = _get_first_sentence_or_part(header_text)
        except Exception:
            # 4. in a last instance get the first sentence
            try:
                article = washer.wash(article,
                                      allowed_tag_whitelist=[],
                                      allowed_attribute_whitelist=[])
            except Exception:
                # was not able to parse correctly the HTML. Use
                # this safer function, but producing less good
                # results
                article = remove_html_markup(article)

            header_text = _get_first_sentence_or_part(article)

    return header_text

예제 #18

0

파일 보기

파일: webmessage_mailutils.py 프로젝트: aw-bib/tind-invenio

def email_quoted_txt2html(
    text,
    tabs_before=0,
    indent_txt=">>",
    linebreak_txt="\n",
    indent_html=('<div class="commentbox">', "</div>"),
    linebreak_html="<br/>",
    indent_block=True,
):
    """
    Convert email-style quoted text into HTML.

    Takes a typical mail quoted text, e.g.::
        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!

    and return an html formatted output, e.g. (default arguments, leading
    tabs not shown)::
        hello,<br/>
        you told me:<br/>
        <div class="commentbox">
          Your mother was a hamster and your father smelt of elderberries<br/>
        </div>
        I must tell you that I'm not convinced. Then in this discussion:<br/>
        <div class="commentbox">
          <div class="commentbox">
            Is there someone else up there we could talk to?<br/>
          </div>
          No. Now, go away, or I shall taunt you a second time-a!<br/>
        </div>
        I think we're not going to be friends!<br/>

    The behaviour is different when C{indent_block} is C{True} or C{False}.
    When C{True}, C{indent_html} is only added at each change of level of
    indentation, while it is added for each line when C{False}.
    For eg::
        >> a
        >> b
        >>>> c

    would result in (if C{True})::
        <div class="commentbox">
            a<br/>
            b<br/>
            <div class="commentbox">
                c<br/>
            </div>
        </div>

    or would be (if C{False})::
        <div class="commentbox"> a</div><br/>
        <div class="commentbox"> b</div><br/>
        <div class="commentbox"><div class="commentbox"> c</div></div><br/>

    Each line (quote markers removed) is passed through HTMLWasher.wash();
    when that raises HTMLParseError the line is escaped with cgi.escape()
    instead.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>'). An empty
                       separator means that no line is considered quoted.
    @param linebreak_txt: line separator in email
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @param indent_block: if indentation should be done per 'block'
                         i.e. only at changes of indentation level
                         (+1, -1) or at each line.
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    opening_tag, closing_tag = indent_html[0], indent_html[1]

    # Output fragments, joined once at the end.
    chunks = []
    nb_indent = 0
    for line in text.strip("\n").split(linebreak_txt):
        # Count and strip the leading quote markers. Every string starts
        # with '', so an empty marker must not enter the loop.
        new_nb_indent = 0
        while indent_txt and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[len(indent_txt):]

        if indent_block:
            if new_nb_indent > nb_indent:
                # Deeper quoting: open one block per additional level.
                for dummy in range(nb_indent, new_nb_indent):
                    chunks.append(tabs_before * "\t" + opening_tag + "\n")
                    tabs_before += 1
            elif new_nb_indent < nb_indent:
                # Shallower quoting: close the levels that ended.
                for dummy in range(new_nb_indent, nb_indent):
                    tabs_before -= 1
                    chunks.append(tabs_before * "\t" + closing_tag + "\n")
            else:
                # Same depth: indentation only. The line is prefixed with
                # tabs again below; kept so existing output is unchanged.
                chunks.append(tabs_before * "\t")
        else:
            # Per-line mode: wrap this single line in as many opening tags
            # as it has quoting levels.
            chunks.append(tabs_before * "\t" + new_nb_indent * opening_tag)

        try:
            line = washer.wash(line)
        except HTMLParseError:
            # Line contained something like "foo<bar"
            line = cgi.escape(line)

        if indent_block:
            chunks.append(tabs_before * "\t")
        chunks.append(line)
        if not indent_block:
            chunks.append(new_nb_indent * closing_tag)
        chunks.append(linebreak_html + "\n")
        nb_indent = new_nb_indent

    if indent_block:
        # Close whatever is still open after the last line, with the
        # caller's closing tag rather than a literal "</div>".
        for dummy in range(nb_indent):
            tabs_before -= 1
            chunks.append(tabs_before * "\t" + closing_tag + "\n")
    return "".join(chunks)

예제 #19

0

파일 보기

파일: bfe_webjournal_article_body.py 프로젝트: valkyriesavage/invenio

def format_element(bfo, separator='<br/>'):
    """
    Display article body

    In the general case the article bodies ('520__b' / '590__b') are simply
    joined with C{separator}. Only when CFG_CERN_SITE is set and the record
    collection ('980__a') starts with 'BULLETIN' can the compatibility code
    for old CERN Bulletin articles run:
      - articles from 2009 on (unless BULLETINSTAFF marked "CERN EDS") are
        rendered like any other article;
      - articles before 2007, and BULLETINSTAFF ones, go through
        __backward_compatible_HTML();
      - the remaining ones are split into header, images and paragraphs
        and re-assembled with the journal's CSS classes.

    @param bfo: the format object; this function reads bfo.user_info['uri'],
                bfo.lang, bfo.field() and bfo.fields()
    @param separator: separator between each body
    @return: the HTML of the article body
    """
    # Retrieve context (journal, issue and category) from URI
    args = parse_url_string(bfo.user_info['uri'])
    ln = args["ln"]
    # NOTE(review): the returned translation function is not used below;
    # the call is kept as it was - confirm whether it can be dropped.
    _ = gettext_set_language(ln)

    empty_bodies = ['', '<br />', '<!--HTML--><br />']
    if ln == "fr":
        preferred_body, fallback_body = '590__b', '520__b'
    else:
        preferred_body, fallback_body = '520__b', '590__b'
    article = bfo.fields(preferred_body)
    if not article or \
           (len(article) == 1 and article[0].strip() in empty_bodies):
        article = bfo.fields(fallback_body)

    if not CFG_CERN_SITE or \
           not bfo.field('980__a').startswith('BULLETIN'):
        return separator.join(article)

    ################################################################
    #                  CERN Bulletin-specific code                 #
    ################################################################

    # We need a compatibility layer for old CERN Bulletin
    # articles. Identify them and process them if needed.
    # (From here on the collection is known to start with 'BULLETIN'.)
    is_staff_bulletin = bfo.field('980__a').startswith('BULLETINSTAFF')
    try:
        year = int(bfo.fields('260__c')[0])
    except IndexError:
        # No publication year recorded: handled as an old article.
        year = 2000
    is_old_cern_bulletin_article = year < 2009 or \
        (is_staff_bulletin and "CERN EDS" in bfo.field('595__a'))

    if not is_old_cern_bulletin_article:
        # Return the same as any other journal article
        return separator.join(article)

    # Old CERN articles
    if year < 2007 or is_staff_bulletin:
        # Really old CERN articles
        if not article:
            return ''
        # CERN-only: old CERN Bulletin articles
        old_html = __backward_compatible_HTML(article[0])
        if is_staff_bulletin:
            old_html += '<br/><br/>' + \
                bfe_fulltext.format_element(bfo, style="", show_icons='yes')
        return old_html

    # Not-so-old CERN articles follow:

    # 2. prepare regex's for the elements
    #=====================================================
    from invenio.webjournal_utils import \
         image_pattern, \
         para_pattern, \
         header_pattern

    # Maps a character offset to either an image description (dict) or a
    # paragraph (string); sorting the offsets gives the rendering order.
    page_elements = {}

    # 3. get the header (either from marc xml or regex)
    #=====================================================
    # Note: the header language comes from bfo.lang, whereas the body
    # language above comes from the 'ln' argument of the URI.
    if bfo.lang == "fr":
        preferred_header, fallback_header = '590__a', '520__a'
    else:
        preferred_header, fallback_header = '520__a', '590__a'
    header = bfo.field(preferred_header)
    if header == '':
        header = bfo.field(fallback_header)

    if header:
        header_text = header
    else:
        try:
            header_text = re.search(header_pattern,
                                    article[0]).group("header")
        except (IndexError, AttributeError, TypeError):
            # No article body, or no header found in it.
            header_text = ""

    washer = HTMLWasher()
    header_text_clean = washer.wash(html_buffer=header_text,
                                    allowed_tag_whitelist=['a'],
                                    allowed_attribute_whitelist=['href'])

    header_out = '<p class="articleHeader">' + header_text_clean + '</p>'

    # strip out all empty p tags and the header
    try:
        article = article[0].replace("<p/>", "")
        article = article.replace(header_text, "")
        article = article.replace(header_text_clean, "")
    except IndexError:
        article = ""

    # Collect the images and cut each one out of the article (else it
    # might be used twice). The matches refer to offsets in the text as it
    # was before any removal, so the number of characters already removed
    # is subtracted. Slicing removes exactly the matched span; the former
    # str.replace() removed every identical occurrence at once, which
    # invalidated the offsets of the following matches.
    removed_chars = 0
    for image in image_pattern.finditer(article):
        start, end = image.span()
        page_elements[start] = {"link" : image.group("hyperlink"),
                                "image" : image.group("image"),
                                "caption" : image.group("caption")}
        article = article[:start - removed_chars] + \
                  article[end - removed_chars:]
        removed_chars += end - start

    # replace <center> by <p><center>
    article = article.replace("<center>", "<p><center>")
    article = article.replace("</center>", "</center></p>")

    # NOTE(review): paragraph offsets refer to the article after the image
    # removal while image offsets refer to the text before it, so a
    # paragraph can overwrite an image stored under the same offset.
    for paragraph in para_pattern.finditer(article):
        page_elements[paragraph.start()] = paragraph.group("paragraph")

    article_out = ""
    left_right_lever = True      # alternates the side images float to
    did_you_know_box = False     # True right after a "did you know" title
    for key in sorted(page_elements):
        element = page_elements[key]
        if isinstance(element, types.DictType):
            if left_right_lever:
                article_out += '<div class="phrwithcaption"><div class="imageScale">'
            else:
                article_out += '<div class="phlwithcaption"><div class="imageScale">'
            has_link = element["link"] is not None
            if has_link:
                article_out += '<a href="' + element["link"] + '">'
            article_out += '<img class="featureImageScaleHolder" src="' + \
                           element["image"] + '" border="0" />'
            if has_link:
                # Only close the anchor when one was opened above.
                article_out += '</a>'
            article_out += '</div>'
            if element["caption"] is not None:
                article_out += '<p>' + element["caption"] + '</p>'
            article_out += '</div>'
        elif isinstance(element, types.StringType):
            left_right_lever = not left_right_lever
            lowered = element.lower()
            if "did you know" in lowered or "le saviez-vous ?" in lowered:
                # Title of a "did you know" box: the next paragraph is
                # its content, the title itself is not printed.
                did_you_know_box = True
                continue
            if did_you_know_box:
                did_you_know_box = False
                article_out += __did_you_know_box(element,
                                                  left_right_lever,
                                                  bfo.lang)
                continue
            article_out += '<p>' + element + '</p>'

    return header_out + article_out

예제 #20

0

파일 보기

파일: htmlutils_unit_tests.py 프로젝트: aw-bib/tind-invenio

 def __init__(self, methodName='test'):
     """Store an HTMLWasher on self.washer, then run the base initialiser.

     @param methodName: forwarded unchanged to InvenioTestCase.__init__
                        (default: 'test')
     """
     self.washer = HTMLWasher()
     InvenioTestCase.__init__(self, methodName)

예제 #21

0

파일 보기

 def __init__(self, methodName='test'):
     """Create the HTMLWasher kept in self.washer and initialise the base
     class.

     @param methodName: passed as is to InvenioTestCase.__init__
                        (default: 'test')
     """
     self.washer = HTMLWasher()
     InvenioTestCase.__init__(self, methodName)

예제 #22

0

파일 보기

 def __init__(self, methodName='test'):
     """Create the HTMLWasher kept in self.washer and initialise the
     unittest.TestCase base class.

     @param methodName: passed as is to unittest.TestCase.__init__
                        (default: 'test')
     """
     self.washer = HTMLWasher()
     unittest.TestCase.__init__(self, methodName)