Python BeautifulSoup 예제들, sigil_bs4.BeautifulSoup Python 예제들

예제 #1

0

파일 보기

def run(bk):
    """Set the <title> of each processed file to its top-ranked heading text.

    Every file yielded by ``files_iter(bk, on_selected)`` is parsed with the
    'lxml' parser.  The text of the first heading found, trying h1 through h6
    in that order, becomes the document title; when no heading exists the
    title is set to the empty string.  A <title> tag is created inside <head>
    if the document lacks one.  The result is written back pretty-printed.

    Returns:
        0 always.
    """
    # True when bk.selected_iter() yields at least one truthy item.
    on_selected = any(bk.selected_iter())

    for file_id, _href in files_iter(bk, on_selected):
        xhtml_soup = sigil_bs4.BeautifulSoup(bk.readfile(file_id), 'lxml')

        # Heading levels are tried by rank, not by document order.
        header = ''
        for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            heading = getattr(xhtml_soup, level)
            if heading:
                header = heading.text
                break

        head = xhtml_soup.head
        if not head.title:
            head.append(xhtml_soup.new_tag("title"))
        head.title.string = header

        bk.writefile(file_id, xhtml_soup.prettyprint_xhtml(indent_chars="  "))
    return 0

예제 #2

0

파일 보기

파일: sigil_gumbo_bs4_adapter.py 프로젝트: zjj00520/Sigil

def parse(text, **kwargs):
    """Parse *text* with gumbo and return the result as a BeautifulSoup tree.

    ``kwargs`` are forwarded unchanged to ``gumboc.parse``.  The tree is built
    while the gumbo output is still open: the document node is registered
    first, each of its children is converted and appended, and finally the
    next/previous pointers are wired up starting from ``<html>``.
    """
    with gumboc.parse(text, **kwargs) as gumbo_output:
        tree = sigil_bs4.BeautifulSoup('', "html.parser")
        document = gumbo_output.contents.document.contents
        _add_document(tree, document)
        for child in document.children:
            tree.append(_add_node(tree, child))
        _add_next_prev_pointers(tree.html)
        return tree

예제 #3

0

파일 보기

파일: plugin.py 프로젝트: dreamer2908/Sigil-Plugins

def getCoverImageID(bk):
    """Return the cover image id recorded in the book's metadata.

    Looks through the <meta> elements of ``bk.getmetadataxml()`` for the first
    one whose ``name`` attribute is ``cover`` and returns its ``content``
    attribute.  Returns the empty string when no such element exists.
    """
    opf_soup = sigil_bs4.BeautifulSoup(bk.getmetadataxml(), 'xml')
    for meta in opf_soup.find_all('meta'):
        if meta.get('name') == 'cover':
            # Only the first matching <meta> is considered.
            return meta.get('content')
    return ''

예제 #4

0

파일 보기

파일: plugin.py 프로젝트: dreamer2908/Sigil-Plugins

def editIdentifierInToC(bk, BookId):
    """Write *BookId* into the ``dtb:uid`` meta entry of the ToC file.

    The ToC file (``bk.gettocid()``) is parsed as XML, every <meta> under
    <head> whose ``name`` is ``dtb:uid`` gets its ``content`` replaced by
    *BookId*, and the prettified document is written back.
    """
    toc_id = bk.gettocid()
    toc_soup = sigil_bs4.BeautifulSoup(bk.readfile(toc_id), 'xml')
    head = toc_soup.find('head')

    for meta in head.find_all('meta'):
        if meta.get('name') != "dtb:uid":
            continue
        meta['content'] = BookId
        print('Setting identifier in ToC: %s' % meta)

    # Persist the modified ToC.
    bk.writefile(toc_id, toc_soup.prettify())

예제 #5

0

파일 보기

파일: plugin.py 프로젝트: dreamer2908/Sigil-Plugins

def setCoverImageID(bk, coverImgID):
    """Record *coverImgID* as the cover image in the book's metadata.

    When *coverImgID* is falsy nothing is written.  Otherwise every existing
    ``<meta name="cover">`` under <metadata> is removed, a fresh one carrying
    *coverImgID* as ``content`` is appended, and the metadata is saved via
    ``bk.setmetadataxml``.
    """
    metadata_soup = sigil_bs4.BeautifulSoup(bk.getmetadataxml(), 'xml')
    metadata_node = metadata_soup.find('metadata')

    if not coverImgID:
        return

    # Drop any previous cover entries so only one remains afterwards.
    for existing in metadata_node.find_all('meta'):
        if existing.get('name') != 'cover':
            continue
        existing.decompose()

    cover_meta = metadata_soup.new_tag('meta')
    cover_meta['name'] = 'cover'
    cover_meta['content'] = coverImgID
    metadata_node.append(cover_meta)

    bk.setmetadataxml(str(metadata_soup))

예제 #6

0

파일 보기

파일: plugin.py 프로젝트: googed/Sigil-Ebook-Plugins

def run(bk):
    """Apply ``conversionDict`` substitutions to digit-bearing text elements.

    For every text file from ``bk.text_iter()``, each <p>, <div> or <span>
    whose string matches a run of digits has every ``conversionDict`` key
    (used as a regex) replaced by its value.  Files in which at least one
    element matched are written back after passing through
    ``fixSelfCloseTags``.

    Returns:
        0 always.
    """
    print('start')
    # Raw string: '\d' is not a valid escape in a normal string literal.
    # Compiled once because the pattern does not change between files.
    digit_pattern = re.compile(r'(\d+)')
    for (file_id, _) in bk.text_iter():
        modified = False
        html = bk.readfile(file_id)
        soup = sigil_bs4.BeautifulSoup(html)
        # Original author's note: a <br> tag inside a <p> prevents that <p>
        # from being found by this search.
        for elem in soup.findAll(['p', 'div', 'span'],
                                 text=digit_pattern):
            modified = True
            text = elem.string
            for key in conversionDict:
                text = re.sub(key, conversionDict[key], text)
            elem.string.replace_with(text)
        if modified:
            # Report the manifest id of the file, not the builtin ``id``.
            print("Modifed File -> ", file_id)
            bk.writefile(file_id, fixSelfCloseTags(str(soup)))
    return 0

예제 #7

0

파일 보기

def run(bk):
    """Set the <title> of each processed file to its file name without extension.

    Every file yielded by ``files_iter(bk, on_selected)`` is parsed with the
    'lxml' parser.  The base name of the file's href, cut at its last ".",
    is stored as the text of <title>; the tag is created inside <head> when
    missing.  The result is written back pretty-printed.

    Returns:
        0 always.
    """
    # True when bk.selected_iter() yields at least one truthy item.
    on_selected = any(bk.selected_iter())

    for file_id, file_href in files_iter(bk, on_selected):
        xhtml_soup = sigil_bs4.BeautifulSoup(bk.readfile(file_id), 'lxml')

        # bk.href_to_basename has a typo until version 0.9.5 of Sigil, so
        # older launchers use the module-level href_to_basename instead.
        if bk.launcher_version() > 20160325:
            file_name = bk.href_to_basename(file_href)
        else:
            file_name = href_to_basename(file_href)

        existing_title = xhtml_soup.head.title
        stem = file_name[:file_name.rindex(".")]
        if existing_title:
            existing_title.string = stem
        else:
            new_title = xhtml_soup.new_tag("title")
            new_title.string = stem
            xhtml_soup.head.append(new_title)

        bk.writefile(file_id, xhtml_soup.prettyprint_xhtml(indent_chars="  "))
    return 0

예제 #8

0

파일 보기

def parse_xml(bk: 'BookContainer', collector: XHTMLAttributes,
              prefs: MutableMapping) -> XHTMLAttributes:
    """Collect fragment identifiers from the book's non-XHTML XML files.

    Args:
        bk: book container providing ``text_iter``, ``manifest_iter`` and
            ``readfile``.
        collector: receives the fragment identifiers in its
            ``fragment_identifier`` set; also supplies the fallback list of
            attribute names.
        prefs: mapping whose ``'fragid_container_attrs'`` entry, when truthy,
            overrides the collector's attribute list.

    Returns:
        The same *collector* object.

    Raises:
        XMLParsingError: if a file cannot be read or parsed.
    """
    attr_names = (prefs['fragid_container_attrs']
                  or collector.fragid_container_attrs)
    xhtml_ids = {text_id for text_id, _ in bk.text_iter()}

    for file_id, href, mime in bk.manifest_iter():
        # Text (XHTML) files are not handled here.
        if file_id in xhtml_ids:
            continue
        # Anything whose media type is not an XML flavour is ignored.
        if not re.search(r'[/+]xml\b', mime):
            continue

        try:
            soup = sigil_bs4.BeautifulSoup(bk.readfile(file_id), 'lxml-xml')
        except Exception as err:
            raise XMLParsingError('Error in {}: {}'.format(
                utils.href_to_basename(href), err))

        # Check every element for each attribute that may hold a fragment id.
        for elem in soup.find_all(True):
            for attr in attr_names:
                fragid = get_fragid(elem, attr)
                if fragid:
                    collector.fragment_identifier.add(fragid)
    return collector

예제 #9

0

파일 보기

파일: plugin.py 프로젝트: dreamer2908/Sigil-Plugins

def newIdentifierInMetadata(bk):
    """Replace the book's ``BookId`` identifier with a fresh random UUID.

    Existing <identifier> elements under <metadata> whose ``id`` is
    ``BookId`` are removed, a new ``dc:identifier`` carrying a UUID4 URN is
    appended, and the metadata is saved via ``bk.setmetadataxml``.

    Returns:
        The newly generated identifier (a ``urn:uuid:...`` string).
    """
    soup = sigil_bs4.BeautifulSoup(bk.getmetadataxml(), 'xml')
    metadata = soup.find('metadata')

    # Discard the previous BookId entry, if any.
    for old_identifier in metadata.find_all('identifier'):
        if old_identifier.get('id') != "BookId":
            continue
        old_identifier.decompose()

    new_book_id = uuid.uuid4().urn
    identifier_tag = soup.new_tag('dc:identifier')
    identifier_tag['id'] = "BookId"
    identifier_tag['opf:scheme'] = "UUID"
    identifier_tag.string = new_book_id
    metadata.append(identifier_tag)

    print('Setting metadata: %s' % identifier_tag)

    bk.setmetadataxml(str(soup))

    return new_book_id

예제 #10

0

파일 보기

def run(bk):
    """Rebuild the SVG wrappers of images inside ``svg_outer`` divs.

    For each text file from ``bk.text_iter()``, every <img>/<svg> found in a
    <div> whose class contains ``svg_outer`` is resolved to a manifest id.
    An existing, non-empty image is replaced by the markup returned from
    ``getSvgForImage``; a div that references a missing or zero-length image
    is removed.  The file is then written back with a single XML declaration
    at the top.

    Sets the module-level ``plugin_path`` as a side effect.

    Returns:
        0 always.
    """
    # get python plugin path
    global plugin_path
    plugin_path = os.path.join(bk._w.plugin_dir, plugin_name)

    for (textID, textHref) in bk.text_iter():
        print('\nProcessing text file: %s' % textHref)

        textContents = bk.readfile(textID)
        # Decode as UTF-8 when the file content is not already text.
        if not isinstance(textContents, text_type):
            textContents = text_type(textContents, 'utf-8')

        soup = sigil_bs4.BeautifulSoup(textContents, "xml")

        # TODO: near square image?
        # done in getSvgForImage. not yet backport to baka-epub

        # Defaults; individual divs can override them through their class.
        useImgForLandscape = False
        svgSizePercent = 98

        # Divs are collected here and removed only after the scan finishes.
        removeMe = []
        for divNode in soup.find_all("div"):
            if divNode.has_attr('class') and "svg_outer" in divNode['class']:
                for imgNode in divNode.find_all(["img", "svg"]):
                    if imgNode.name == 'img':
                        imgSrc = imgNode['src']
                    else:
                        imgSrc = imgNode.image['xlink:href']
                    # Drop a leading '../' before looking up the manifest id.
                    if imgSrc.startswith('../'):
                        imgSrc = imgSrc[3:]
                    imgID = bk.href_to_id(imgSrc)
                    if imgID:  # image file exists
                        print('Found image: ' + imgSrc)
                        if len(bk.readfile(imgID)) == 0:
                            print('Zero-length file. Removing...')
                            removeMe.append(divNode)
                        else:
                            # "svg_yes" overrides the landscape <img> default.
                            _useImg = useImgForLandscape
                            if "svg_yes" in divNode['class']:
                                _useImg = False
                            # "svg_100" requests the full 100% size.
                            _svgSizePercent = svgSizePercent
                            if "svg_100" in divNode['class']:
                                _svgSizePercent = 100
                            svgNode = sigil_bs4.BeautifulSoup(
                                getSvgForImage(bk,
                                               imgID,
                                               svgSizePercent=_svgSizePercent,
                                               useImgForLandscape=_useImg,
                                               dontWrapInDiv=True), "xml")
                            imgNode.replace_with(svgNode)
                    else:
                        print('404 error: ' + imgSrc + '. Removing...')
                        removeMe.append(divNode)

        for element in removeMe:
            element.decompose()

        textContents = str(soup)
        # Strip any XML declaration in the serialised text, then prepend one.
        # Raw string: '\?' and '\s' are not valid escapes in a normal
        # string literal.
        textContents = '<?xml version="1.0" encoding="utf-8"?>' + re.sub(
            r'<\?xml\s.*?\?>', '', textContents)
        bk.writefile(textID, textContents)

    print('Done.')
    return 0