Python bs 예제들, mcdp_utils_xml.parsing.bs Python 예제들

예제 #1

0

파일 보기

파일: escape.py 프로젝트: rusi/mcdp

def escape_ticks_before_markdown(html):
    """Escape backticks and double quotes inside code-like elements.

    The markup is parsed with ``bs``; for every element matched by the
    selector below that has a single string child, backticks become
    ``&#96;`` and double quotes become ``&quot;``.  All HTML comments are
    then removed from the document.

    :param html: markup to process.
    :return: the result of ``to_html_stripping_fragment`` on the parsed
        and modified document.
    """
    document = bs(html)

    selector = 'code, pre, mcdp-poset, mcdp-value, mcdp-fvalue, mcdp-rvalue, render'
    substitutions = (('`', '&#96;'), ('"', '&quot;'))
    for element in document.select(selector):
        original_text = element.string
        if not original_text:
            # no single string child (or an empty one): nothing to escape
            continue

        escaped = original_text
        for needle, entity in substitutions:
            # the membership test is done on the original text
            if needle in original_text:
                escaped = escaped.replace(needle, entity)

        # assigned unconditionally, even when nothing was replaced
        element.string = escaped

    def _is_comment(node):
        return isinstance(node, bs4.Comment)

    for comment in document.find_all(string=_is_comment):
        comment.extract()

    return to_html_stripping_fragment(document)

예제 #2

0

파일 보기

def test_toc():
    """Run ``generate_toc`` on a small document with h1/h2/h3 headers.

    After the call, the serialized soup must contain the strings
    ``sec:one`` and ``sub:two``.
    """
    source = """
<html>
<head></head>
<body>
<h1 id='one'>One</h1>

<p>a</p>

<h2 id='two'>Two</h2>

<p>a</p>

<h3 id='three'>Three</h3>

<h2 id='four'>Four</h2>

<p>a</p>
</body>
</html>
    """
    document = bs(source)
    # the return value is not inspected; only the soup is checked afterwards
    _toc = generate_toc(document)
    serialized = str(document)

    for needle in ['sec:one', 'sub:two']:
        assert needle in serialized

예제 #3

0

파일 보기

파일: markd.py 프로젝트: kannode/mcdp

def render_markdown(s, fix_blockquote_pre=True):
    """Render Markdown source to HTML.

    :param s: Markdown source as UTF-8 encoded bytes.  A ``unicode`` object
        is reported through ``raise_desc`` with ``TypeError``.
    :param fix_blockquote_pre: when true and the output contains the text
        ``blockquote``, every parent of a ``blockquote > p > code`` element
        is renamed to ``pre`` and the document is re-serialized with
        ``to_html_stripping_fragment``.
    :return: the HTML, encoded as UTF-8 when no blockquote fix is applied;
        otherwise whatever ``to_html_stripping_fragment`` returns.
    """
    if isinstance(s, unicode):
        raise_desc(TypeError, 'I expect utf-8 encoded bytes.', s=repr(s))

    import markdown  # @UnresolvedImport
    import logging
    logging.getLogger("MARKDOWN").setLevel(logging.CRITICAL)

    # note: 'markdown.extensions.toc' is not enabled
    enabled_extensions = [
        'markdown.extensions.smarty',
        'markdown.extensions.attr_list',
        'markdown.extensions.extra',  # need for markdown=1
        'markdown.extensions.fenced_code',
        'markdown.extensions.admonition',
        'markdown.extensions.tables',
    ]

    # markdown takes and returns unicode
    rendered = markdown.markdown(unicode(s, 'utf-8'), enabled_extensions)
    html = rendered.encode('utf-8')

    if fix_blockquote_pre and 'blockquote' in html:
        soup = bs(html)
        for code in soup.select('blockquote > p > code'):
            code.parent.name = 'pre'
        html = to_html_stripping_fragment(soup)

    return html

예제 #4

0

파일 보기

def test_toc2():
    """Smoke test: ``generate_toc`` on headers without ids.

    Nothing is asserted; the test only requires that generating the TOC
    and serializing the soup do not raise.
    """
    source = """
<html>
<head></head>
<body>
<h1>One</h1>
<h1>Two</h1>
<h1>Three</h1>
<p></p>

<h2>A</h2>

<h2>B</h2>

<h2>C</h2>

<h3>a</h3>
<h3>b</h3>
<h3>c</h3>

</body>
</html>
    """
    document = bs(source)
    _toc = generate_toc(document)
    # serialization result is intentionally unused
    serialized = str(document)

예제 #5

0

파일 보기

def make_page(contents, head0, add_toc, extra_panel_content, add_home_link):
    """Assemble a complete ``html`` element for one page.

    The page gets a copy of ``head0`` and a ``body`` holding an optional
    ``div#tocdiv`` panel followed by ``div#not-toc`` wrapping ``contents``.
    If ``get_first_header_title(contents)`` yields a title, the copied head's
    ``title`` element is replaced (or added) with its text.

    :param contents: element appended inside ``div#not-toc``.
    :param head0: head element; a copy is used, so the title change is made
        on the copy.
    :param add_toc: element appended at the end of the panel, or ``None`` to
        omit the whole panel.
    :param extra_panel_content: optional content placed in a ``details``
        element summarized as "build details" (only when the panel is built).
    :param add_home_link: if true, the panel starts with a "Home" link to
        ``index.html``.
    :return: the newly built ``html`` Tag.
    """
    page = Tag(name='html')
    page_head = head0.__copy__()
    page.append(page_head)
    page_body = Tag(name='body')

    with timeit('make_page() / copy toc'):
        if add_toc is not None:
            panel = Tag(name='div')
            panel.attrs['id'] = 'tocdiv'

            if add_home_link:
                home = Tag(name='a')
                home.attrs['href'] = 'index.html'
                home.append('Home')
                wrapper = Tag(name='p')
                wrapper.append(home)
                panel.append(wrapper)

            if extra_panel_content is not None:
                label = Tag(name='summary')
                label.append('build details')
                build_details = Tag(name='details')
                build_details.attrs['id'] = 'build-details'
                build_details.append(label)
                build_details.append(extra_panel_content)
                panel.append(build_details)

            panel.append(add_toc)
            page_body.append(panel)

    first_title = get_first_header_title(contents)
    if first_title is not None:
        # a space is inserted after each closing code tag before the text is
        # extracted (presumably to keep adjacent words apart -- confirm)
        spaced = first_title.replace('</code>', '</code> ')
        new_title = Tag(name='title')
        new_title.append(gettext(bs(spaced)))

        old_title = page_head.find('title')
        if old_title is None:
            page_head.append(new_title)
        else:
            old_title.replace_with(new_title)

    main_area = Tag(name='div')
    main_area.attrs['id'] = 'not-toc'
    main_area.append(contents)
    page_body.append(main_area)
    page.append(page_body)

    # Disabled in the original code: removal of the main TOC from contents.
    if False:
        main_toc = contents.find(id=MCDPManualConstants.MAIN_TOC_ID)
        if main_toc is not None:
            main_toc.extract()

    return page

예제 #6

0

파일 보기

파일: element_abbrevs_test.py 프로젝트: rusi/mcdp

def elements_abbrevs_test1():
    """A paragraph starting with ``TODO:`` must become a wrapped todo paragraph."""
    source = "<p>TODO: paragraph</p>"
    expected = """<div class="todo-wrap"><p class="todo">paragraph</p></div>"""

    document = bs(source.strip())
    substitute_special_paragraphs(document)

    assert_equal(to_html_stripping_fragment(document), expected)

예제 #7

0

파일 보기

파일: task_markers_test.py 프로젝트: afcarl/mcdp

def task_markers_test1():
    """A paragraph containing ``(TODO)`` must receive the ``status-todo`` class."""
    source = "<p>We should do this (TODO)</p>"
    expected = """<p class="status-todo">We should do this (TODO)</p>"""

    document = bs(source.strip())
    substitute_task_markers(document)

    assert_equal(to_html_stripping_fragment(document), expected)

예제 #8

0

파일 보기

def link_to_command_explanation_check2():
    """Check that a ``curl`` program span in a console block gets linked.

    After ``link_to_command_explanation`` runs, the serialized soup must
    contain an anchor whose href is ``#curl``.
    """
    source = """
    <pre class="console"><code><span class="console_sign">$</span><span class="space"> </span><span class="curl program">curl</span><span class="space"> </span><span class="program_option">-o</span><span class="space"> </span>duckiebot-RPI3-AC-aug10.img.xz<span class="space"> </span><span class="placeholder">URL above</span>
</code></pre>"""

    document = bs(source)
    link_to_command_explanation(document)
    rendered = str(document)
    # single-argument call form: prints the same text under Python 2
    print(rendered)
    assert '<a href="#curl"' in rendered

예제 #9

0

파일 보기

def link_to_command_explanation_check1():
    """Check that an ``ls`` program span in a console block gets linked.

    After ``link_to_command_explanation`` runs, the serialized soup must
    contain an anchor whose href is ``#ls``.
    """
    source = """
<pre class='console'>
<span class='program'>ls</span> file
</pre>
    """
    document = bs(source)
    link_to_command_explanation(document)
    rendered = str(document)
    assert '<a href="#ls"' in rendered

예제 #10

0

파일 보기

def displayfile1():
    """Run ``display_files`` on one ``display-file`` element with a github: source.

    The call must return 1 (presumably the number of elements processed --
    confirm against ``display_files``); the resulting markup is logged.
    """
    github_defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}

    source = """
<display-file src="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a> 
"""
    document = bs(source)
    count = display_files(document, github_defaults, raise_errors=True)
    assert count == 1

    rendered = str(document)
    logger.debug('\n' + indent(rendered, '  '))

예제 #11

0

파일 보기

def sub1():
    """Run ``substitute_github_refs`` on a link that only names a path.

    The call must return 1 and the output must contain a
    ``code.github-resource-link`` element showing the file name.
    """
    github_defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}

    source = """
<a href="github:path=context_eval_as_constant.py"></a> 
"""
    document = bs(source)
    count = substitute_github_refs(document, github_defaults)
    assert count == 1

    rendered = str(document)
    logger.debug(indent(rendered, '  '))

    wanted = '<code class="github-resource-link">context_eval_as_constant.py</code>'
    if wanted not in rendered:
        raise Exception(rendered)

예제 #12

0

파일 보기

def sub2():
    """Run ``substitute_github_refs`` on a link with ``from_text``/``to_text``.

    The call must return 1 and the output must reference the line range
    ``context_eval_as_constant.py#L7-L12``.
    """
    github_defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}

    source = """
<a href="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a> 
"""
    document = bs(source)
    count = substitute_github_refs(document, github_defaults)
    assert count == 1

    rendered = str(document)
    logger.debug('\n' + indent(rendered, '  '))

    wanted = 'context_eval_as_constant.py#L7-L12'
    if wanted not in rendered:
        raise Exception('No %s in %s' % (wanted, rendered))

예제 #13

0

파일 보기

def tags_in_titles2():
    """Render and join a document, then inspect the element with id ``main_toc``.

    Fails if the serialized element contains the text ``fragment``.
    """
    page_template = """
    <html>
    <head>
        </head>
    <body>
        <div id='toc'></div>
    </body>
    </html>
"""
    markdown_source = """


<span id='frag'>I will refer to <a href="#two" class='number_name'></a></span>

# One is ok {#one}

Ignore

# Two with `program` {#two}

Another.
 
    
"""
    rendered = render_complete(MCDPLibrary(), markdown_source, True,
                               'transformations.py', generate_pdf=False)
    docs = [DocToJoin(docname='one', contents=rendered, source_info=None)]
    joined = manual_join(page_template, docs,
                         'v_manual_blurb_ready', remove=None, extra_css=None,
                         remove_selectors=None,
                         hook_before_toc=None)

    document = bs(joined.get_result())
    toc_element = document.find(id='main_toc')
    # single-argument call form: prints the same text under Python 2
    print(toc_element)
    toc_text = str(toc_element)
    if 'fragment' in toc_text:
        raise Exception(toc_text)

예제 #14

0

파일 보기

def tags_in_titles1():
    """Render and join a document, then inspect the element with id ``frag``.

    Fails if the serialized element contains an escaped ``<code>`` tag
    (``&lt;code&gt;``).
    """
    page_template = """
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
    <html lang="en">
    <head>
        <title>The Duckietown book</title>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    </head>
<body>
</body>
</html>
"""
    markdown_source = """


<span id='frag'>I will refer to <a href="#two" class='number_name'></a></span>

# Two with `program` {#two}

Another.
 
    
"""
    rendered = render_complete(MCDPLibrary(), markdown_source, True,
                               'transformations.py', generate_pdf=False)
    docs = [DocToJoin(docname='one', contents=rendered, source_info=None)]
    joined = manual_join(template=page_template, files_contents=docs,
                         stylesheet='v_manual_blurb_ready', remove=None,
                         extra_css=None,
                         remove_selectors=None,
                         hook_before_toc=None)

    document = bs(joined.get_result())
    frag_element = document.find(id='frag')
    # single-argument call form: prints the same text under Python 2
    print(frag_element)
    frag_text = str(frag_element)
    if '&lt;code&gt;' in frag_text:
        raise Exception(frag_text)

예제 #15

0

파일 보기

파일: split.py 프로젝트: rusi/mcdp

def split_file(html, directory):
    """Split one HTML document into several pages and write them out.

    The document body is partitioned with ``split_in_files``; each part is
    wrapped in a fresh ``html`` element with a copy of the original head, an
    optional copy of the main TOC (inside ``div#tocdiv``) and a Disqus
    section built from the module-level ``disqus`` template.

    :param html: the complete HTML document; parsed with lxml as UTF-8.
    :param directory: passed to ``write_split_files`` as the destination.
    :raises AssertionError: if the parsed document has no ``body``.
    """
    # Function-scope import, as in the original; done once instead of once
    # per generated page.
    from mcdp import logger

    soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')
    body = soup.html.body
    # Check before the first use of ``body``: the original asserted only
    # after calling ``body.find``, which would already have failed.
    assert body is not None, soup

    # look up the main toc if it is there (it is left in place)
    main_toc = body.find(id='main_toc')

    filename2contents = split_in_files(body)
    add_prev_next_links(filename2contents)
    for filename, contents in list(filename2contents.items()):
        page = Tag(name='html')
        page.append(soup.html.head.__copy__())
        page_body = Tag(name='body')
        if main_toc is not None:
            tocdiv = Tag(name='div')
            tocdiv.attrs['id'] = 'tocdiv'
            tocdiv.append(main_toc.__copy__())
            # Appended only when a TOC exists; the original appended
            # unconditionally and hit an undefined ``tocdiv`` otherwise.
            page_body.append(tocdiv)
        page_body.append(contents)
        page.append(page_body)

        page_identifier = filename.replace('.html', '')
        page_url = 'https://duckietown.github.io/duckuments/master/' + filename
        snippet = disqus
        snippet = snippet.replace('PAGE_IDENTIFIER', page_identifier)
        snippet = snippet.replace('PAGE_URL', page_url)
        disqus_section = bs(snippet)
        logger.info(str(snippet))
        page_body.append(disqus_section)

        filename2contents[filename] = page

    update_refs(filename2contents)

    write_split_files(filename2contents, directory)

예제 #16

0

파일 보기

def link_to_command_explanation_check3():
    """Check command linking inside a larger ``fragment`` document.

    After ``link_to_command_explanation`` runs, the serialized soup must
    contain an anchor whose href is ``#ifconfig``.
    """
    source = """
 <fragment><div style="display:none">Because of mathjax bug</div>
<h1 id="networking">Networking tools</h1>
<div class="special-par-assigned-wrap"><p class="special-par-assigned">Andrea</p></div>
<div class="requirements">
<p>Preliminary reading:</p>
<ul>
<li>
<p>Basics of networking, including</p>
<ul>
<li>what are IP addresses</li>
<li>what are subnets</li>
<li>how DNS works</li>
<li>how <code>.local</code> names work</li>
<li>…</li>
</ul>
 </li>
</ul>
<div class="special-par-see-wrap"><p class="status-XXX special-par-see"> (ref to find).</p></div>
</div>
<div class="todo-wrap"><p class="todo">to write</p></div>
<p>Make sure that you know:</p>
<h2 id="visualizing-information-about-the-network">Visualizing information about the network</h2>
<h3 id="ping-are-you-there"><code>ping</code>: are you there?</h3>
<div class="todo-wrap"><p class="todo">to write</p></div>
<h3 id="ifconfig"><code>ifconfig</code></h3>
<div class="todo-wrap"><p class="todo">to write</p></div>
<pre class="console"><code><span class="console_sign">$</span><span class="space"> </span><span class="ifconfig program">ifconfig</span>
</code></pre></fragment>

"""
    document = bs(source)
    link_to_command_explanation(document)
    rendered = str(document)
    assert '<a href="#ifconfig"' in rendered

예제 #17

0

파일 보기

파일: footnote_javascript.py 프로젝트: rusi/mcdp

def add_footnote_polyfill(soup):
    """Append the parsed ``footnote_javascript`` markup to the document body.

    :param soup: parsed document; it must contain a ``body`` element.
    """
    polyfill = bs(footnote_javascript)
    soup.find('body').append(polyfill)

예제 #18

0

파일 보기

파일: note_errors_inline.py 프로젝트: kannode/mcdp

    s = ''
    for element in soup.select('details.' + ERROR_CLASS):
        summary = element.summary.text.encode('utf8')
        e2 = element.__copy__()
        e2.summary.extract()
        other = e2.text.encode('utf8')
        s0 = summary + '\n\n' + other
        s += '\n\n' + indent(s0, '', '* ')
    return s


if __name__ == '__main__':
    # Script entry point: parse the file named by the first command-line
    # argument and report what search_for_errors() finds in it.
    filename = sys.argv[1]
    # The context manager closes the file handle deterministically; the
    # original left it open until garbage collection.
    with open(filename) as input_file:
        data = input_file.read()
    soup = bs(data)
    s = search_for_errors(soup)
    if s:
        logger.error('Found a few errors:')
        logger.error(s)
    else:
        logger.info('No errors found.')


@contract(long_error='str|$Tag')
def insert_inset(element, short, long_error, klasses=[]):
    """ Inserts an errored details after element """
    details = Tag(name='details')
    summary = Tag(name='summary')
    s = Tag(name='strong')
    s.append(short)

예제 #19

0

파일 보기

def go(context,
       worker_i,
       num_workers,
       data,
       mathjax,
       preamble,
       output_dir,
       assets_dir,
       add_toc_if_not_existing,
       extra_panel_content,
       permalink_prefix=None,
       output_crossref=None,
       only_refs=False):
    """Split a rendered document into pages and schedule their processing.

    The document is parsed, a main TOC is located (or generated), the body
    is split with ``split_in_files`` and, for the pages assigned to this
    worker, a full page is built with ``make_page`` and handed to
    ``extract_assets_from_file`` through ``context.comp``.

    :param context: job context; only its ``comp`` method is used here.
    :param worker_i: index of this worker; page ``i`` is handled when
        ``i % num_workers == worker_i``.
    :param num_workers: total number of workers sharing the pages.
    :param data: the complete document, parsed with ``bs_entire_document``.
    :param mathjax: forwarded unchanged to ``only_second_part``.
    :param preamble: forwarded to ``only_second_part``; if it is a string
        ending in ``.tex`` it is treated as a path and replaced by the
        file's contents.  May be ``None``.
    :param output_dir: directory for the pages, ``link.html``, ``link.js``
        and ``toc.html``; created if missing.
    :param assets_dir: forwarded to ``extract_assets_from_file``.
    :param add_toc_if_not_existing: if true and no main TOC is found, one is
        generated; otherwise an error is noted and a placeholder is used.
    :param extra_panel_content: forwarded to ``make_page``.
    :param permalink_prefix: forwarded to ``write_crossref_info``.
    :param output_crossref: if not ``None``, a ``write_crossref_info`` job is
        scheduled with this value.
    :param only_refs: if true, return right after the cross-reference step.
    :return: the ``AugmentedResult`` when ``only_refs`` is true; otherwise
        the value of ``context.comp(wait_assets, res, asset_jobs)``.
    """
    res = AugmentedResult()
    soup = bs_entire_document(data)

    # extract the main toc if it is there
    with timeit("Extracting main toc"):
        main_toc = soup.find(id=MCDPManualConstants.MAIN_TOC_ID)

        if main_toc is None:

            if add_toc_if_not_existing:
                tocg = generate_toc(soup)
                main_toc = bs(tocg).ul
                main_toc.attrs['class'] = 'toc'  # XXX: see XXX13
                assert main_toc is not None
                substituting_empty_links(main_toc,
                                         raise_errors=False,
                                         res=res,
                                         extra_refs=soup)

            else:
                msg = 'Could not find main toc (id #%s)' % MCDPManualConstants.MAIN_TOC_ID
                res.note_error(msg)
                main_toc = Tag(name='div')
                main_toc.append('TOC NOT FOUND')
        else:
            main_toc = main_toc.__copy__()

        if 'id' in main_toc.attrs:
            del main_toc.attrs['id']

    # XXX: this is not the place to do it
    mark_toc_links_as_errored(main_toc, soup)

    body = soup.html.body

    with timeit("split_in_files"):
        filename2contents = split_in_files(body)
        id2filename = get_id2filename(filename2contents)

    res.set_result(id2filename)

    if output_crossref is not None:
        from mcdp_docs.mcdp_render_manual import write_crossref_info
        context.comp(write_crossref_info,
                     data=data,
                     id2filename=id2filename,
                     output_crossref=output_crossref,
                     permalink_prefix=permalink_prefix)

    if only_refs:
        logger.debug('Skipping rest because only_refs')
        return res

    with timeit("add_prev_next_links"):
        filename2contents = add_prev_next_links(filename2contents)

    with timeit("preparing assets dir"):
        if not os.path.exists(output_dir):
            try:
                os.makedirs(output_dir)
            except OSError:
                # Best effort, as in the original (e.g. the directory may
                # have been created between the check and this call).  Only
                # OSError is ignored, so KeyboardInterrupt/SystemExit are no
                # longer swallowed as they were by the bare ``except``.
                pass

    with timeit("creating link.html and link.js"):

        linkbase = 'link.html'  # do not change (it's used by http://purl.org/dth)
        linkbasejs = 'link.js'

        lb = create_link_base(id2filename)
        write_data_to_file(str(lb),
                           os.path.join(output_dir, linkbase),
                           quiet=True)

        linkjs = create_link_base_js(id2filename)
        write_data_to_file(str(linkjs),
                           os.path.join(output_dir, linkbasejs),
                           quiet=True)

    if preamble is not None:
        if preamble.endswith('.tex'):  # XXX
            # close the file deterministically instead of leaking the handle
            with open(preamble) as preamble_file:
                preamble = preamble_file.read()

    ids_to_use = sorted(k for k in id2filename if 'autoid' not in k)

    # NOTE(review): ``pointed_to`` is not read anywhere below; it is kept
    # because the commented-out line after it suggests it was used before.
    pointed_to = []
    for k in ids_to_use:
        f = id2filename[k]
        if f not in pointed_to:
            pointed_to.append(f)

    # data = ",".join(pointed_to)
    head0 = soup.html.head

    context.comp(remove_spurious, output_dir, list(filename2contents))

    with timeit('main_toc copy'):
        main_toc0 = main_toc.__copy__()

        main_toc0_s = str(main_toc0)
    asset_jobs = []
    for i, (filename, contents) in enumerate(filename2contents.items()):
        # each worker only handles its own share of the pages
        if i % num_workers != worker_i:
            continue
        with timeit('main_toc copy hack'):
            # re-parse the serialized TOC to obtain a fresh copy for this page
            main_toc = bs(main_toc0_s).ul
            assert main_toc is not None

        # Trick: we add the main_toc, and then ... (look below)
        with timeit('make_page'):
            add_home_link = 'index.html' not in filename2contents
            html = make_page(contents,
                             head0,
                             main_toc,
                             extra_panel_content,
                             add_home_link=add_home_link)

        with timeit("direct job"):
            result = only_second_part(mathjax, preamble, html, id2filename,
                                      filename)

            # ... we remove it. In this way we don't have to copy it every time...
            main_toc.extract()

            fn = os.path.join(output_dir, filename)

            h = get_md5(result)[:8]
            r = context.comp(extract_assets_from_file,
                             result,
                             fn,
                             assets_dir,
                             job_id='%s-%s-assets' % (filename, h))
            asset_jobs.append(r)

    # NOTE(review): ``main_toc`` here is the copy bound by the last loop
    # iteration handled by this worker, or the pre-loop TOC if none ran.
    update_refs_('toc.html', main_toc, id2filename)
    out_toc = os.path.join(output_dir, 'toc.html')
    write_data_to_file(str(main_toc), out_toc, quiet=True)

    return context.comp(wait_assets, res, asset_jobs)