Example #1
def xml2leo(event, from_string=None):
    """handle import of an .xml file, places new subtree after c.p
    """
    c = event['c']
    p = c.p

    if from_string:
        parser_func = etree.fromstring
        file_name = from_string
    else:
        parser_func = etree.parse
        cd_here(c, p)
        file_name = g.app.gui.runOpenFileDialog(c,
                                                title="Open",
                                                filetypes=table,
                                                defaultextension=".xml")

        if not file_name:
            raise Exception("No file selected")

    try:
        xml_ = parser_func(file_name)
    except etree.XMLSyntaxError:
        xml_ = parser_func(file_name, parser=etree.HTMLParser())
    except Exception:
        g.es("Failed to read '%s'" % file_name)
        raise

    if from_string:
        # etree.fromstring and etree.parse return Element and
        # ElementTree respectively
        xml_ = etree.ElementTree(xml_)

    nd = p.insertAfter()
    nd.h = os.path.basename(file_name)

    # the root Element isn't necessarily the first thing in the XML file
    # move to the beginning of the list to capture preceding comments
    # and processing instructions
    toplevel = xml_.getroot()
    while toplevel.getprevious() is not None:
        toplevel = toplevel.getprevious()

    # move through the list, covering the root Element and any comments
    # or processing instructions which follow it
    while toplevel is not None:
        append_element(toplevel, nd)
        toplevel = toplevel.getnext()

    nd.b = '<?xml version="%s"?>\n' % (xml_.docinfo.xml_version or '1.0')
    if xml_.docinfo.encoding:
        nd.b = '<?xml version="%s" encoding="%s"?>\n' % (
            xml_.docinfo.xml_version or '1.0', xml_.docinfo.encoding)
    if NSMAP:
        for k in sorted(NSMAP):
            if k:
                nd.b += "%s: %s\n" % (k, NSMAP[k])
            else:
                nd.b += "%s\n" % NSMAP[k]
    nd.b += xml_.docinfo.doctype + '\n'

    c.redraw()

    return nd
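The comments in this example lean on a few lxml behaviors that are easy to check in isolation: etree.fromstring returns an Element while etree.parse returns an ElementTree, and top-level comments or processing instructions are reachable from the root element via getprevious()/getnext(). A minimal standalone sketch (not part of the plugin above):

from io import BytesIO
from lxml import etree

doc = b"<!-- leading comment --><root><child/></root>"

root = etree.fromstring(doc)       # returns an Element
tree = etree.parse(BytesIO(doc))   # returns an ElementTree
wrapped = etree.ElementTree(root)  # wrap an Element so both paths look alike

# walk back from the root to reach the leading comment, as xml2leo does
node = tree.getroot()
while node.getprevious() is not None:
    node = node.getprevious()
print(node)                         # the leading comment node
print(wrapped.docinfo.xml_version)  # may be None when no declaration was parsed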
Example #2
File: mail.py Project: yemanadep/Flectra
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)

    if not html:
        return ''

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
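A hedged usage sketch for the function above (it assumes the module's own imports such as ustr, etree and re are available; the exact spacing of the output may differ):

sample = '<p>Hello <b>world</b>, see <a href="http://example.com">this page</a>.</p>'
print(html2plaintext(sample))
# Expected, roughly:
# Hello *world*, see this page [1] .
#
# [1] http://example.com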
Example #3
from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)

name_xpath_1 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[2]/div[2]/div/div[3]/text()'
name_xpath_2 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[1]/div[2]/div/div[3]/text()'

name_1 = tree.xpath(name_xpath_1)
name_2 = tree.xpath(name_xpath_2)

print(name_1)
print(type(name_1))
print(name_2)
print(type(name_2))
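The prints above show that an XPath expression ending in text() returns a plain Python list whose items are lxml's string subclass (_ElementUnicodeResult), so they behave like ordinary strings. A self-contained illustration without app.html:

from lxml import etree

snippet = etree.fromstring("<div><span>hello</span></div>", etree.HTMLParser())
texts = snippet.xpath("//span/text()")
print(texts)                      # ['hello']
print(type(texts))                # <class 'list'>
print(isinstance(texts[0], str))  # True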
Example #4
def extract_next_links(rawDatas):
    global most_outlinks, visited_subdomains
    outputLinks = list()

    for urlResponse in rawDatas:
        outlinks = []

        # The URL base path
        basePath = urlResponse.url

        hostName = urlparse(basePath).hostname
        if hostName not in visited_subdomains:
            visited_subdomains[hostName] = set()

        # The content of the page
        content = urlResponse.content

        # Stops us from trying to parse pages with no content or an error
        if not urlResponse.error_message or content:

            # Debug
            if DEBUG_VERY_VERBOSE:
                print "Error Message: ", urlResponse.error_message
                print "Headers: ", urlResponse.headers
                print "Is Redirected: ", urlResponse.is_redirected
                print "Final URL: ", urlResponse.final_url
                print "Content: ", urlResponse.content, "-\n"
                print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

            try:
                # Loading the DOM with etree
                parser = etree.HTMLParser(recover=True)
                pageDom = etree.parse(StringIO.StringIO(content), parser)

                # Checks for the presence of a base tag
                if pageDom.xpath('//base/@href'):
                    basePath = pageDom.xpath('//base/@href')[0]

                # Extracting all of the links
                for linkPath in pageDom.xpath('//a/@href'):

                    # absolutePath = urljoin(basePath, relativePath)
                    absoluteUrl = urljoin(basePath, linkPath)

                    # Adding link to list
                    outlinks.append(absoluteUrl)
                    visited_subdomains[hostName].add(absoluteUrl)

                # If most_outlinks is still unset, assign it a new tuple
                if most_outlinks[0] == "None":
                    most_outlinks = (basePath, len(outlinks))
                # If the stored outlink count is lower than this page's, replace it
                elif most_outlinks[1] < len(outlinks):
                    most_outlinks = (basePath, len(outlinks))

                outputLinks += outlinks

            except AssertionError as err:
                # Setting this as a bad link
                urlResponse.bad_url = True

                # might want to set that built in bad within the url object here???
                if DEBUG:
                    print err.message
        else:
            # Setting this as a bad link
            urlResponse.bad_url = True
            if DEBUG:
                print "No content or an error code exists"
    # Debug
    if DEBUG_VERBOSE:
        print "List of found link: ", outputLinks

    return outputLinks
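The core of the link extraction above is resolving each href against the page URL (or a <base href> when one is present) with urljoin. A small Python 3 illustration of that step (the crawler itself is Python 2 and imports urljoin from urlparse):

from urllib.parse import urljoin

base = "http://example.com/articles/index.html"
print(urljoin(base, "page2.html"))   # http://example.com/articles/page2.html
print(urljoin(base, "/about"))       # http://example.com/about
print(urljoin("http://example.com/docs/", "../img/logo.png"))  # http://example.com/img/logo.png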
Example #5
def parsehtml(file,urlbiglist):
    # start processing the URLs
    for i in range(len(urlbiglist)):
        for j in range(len(urlbiglist[i])):
            # print urlbiglist[i][j]
            for k in range(3):
                try:
                    request = urllib2.Request(url=urlbiglist[i][j], headers=headers)
                    html = urllib2.urlopen(request).read()
                    print "connection succeeded, breaking out of the retry loop"
                    break

                except urllib2.HTTPError, e:
                    print "request failed, retrying"
                    continue
            # handle the page encoding
            char_type = chardet.detect(html)
            print (char_type)
            # check the detected language again: only keep pages in Chinese, skip anything else
            language = ['Chinese','']
            print char_type['language']
            print char_type['language'] in language

            if not ( char_type['language'] in language):
                print char_type['language']
                print char_type['language'] in language
                continue
            if(char_type["encoding"]=='GB2312'):
                try:
                    html = html.decode('gbk').encode('utf-8')
                except UnicodeDecodeError, e:
                    print "encoding problem, skipping this page"
                    continue
            else:
                html = unicode(html, char_type["encoding"]).encode("utf-8")
            pagecontent = etree.HTML(html,parser=etree.HTMLParser(encoding='utf-8'))

            # each page has a different layout, so try several lookup patterns

            # first approach: build one big string (including spaces and blank lines)
            filecontent = ''
            p1 = pagecontent.xpath('//div[@class="main-content"]')
            print type(p1)
            print p1
            print "first pass"
            for i in range(len(p1)):
                filecontent = filecontent + p1[i].xpath('string()')
            # strip spaces and blank lines
            filestringcontent = ''
            file.write('\nThis is an article:\n')
            for line in filecontent.splitlines():
                if not line.split():
                    continue
                line = line.strip()  # strip whitespace (this also removes the newline)
                filestringcontent += line
            file.write(filestringcontent)

            # marker for when each page has finished being scraped

            if(len(p1)):
                continue
            print "第二次找"
            p2 = pagecontent.xpath('//body//div//p//text()')

            print p2
            for l in range(len(p2)):
                print 'printing the content of each page ====='
                print p2[l]
                print type(p2[l])
                file.write(p2[l])
Example #6
from lxml import etree

html = etree.parse("./test.html", etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)
Example #7
def parse_lagoufile():
    parser = etree.HTMLParser(encoding='utf-8')
    htmlElement = etree.parse('lagou.html', parser=parser)
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))
Example #8
def main(number, javlibrary_url):
    try:
        htmlcode = get_html('http://' + javlibrary_url +
                            '/ja/vl_searchbyid.php?keyword=' + number).replace(
                                u'\xa0', u' ')
        title = getTitle(htmlcode)
        movie_found = 1
        if title == '':  # the page is a search-results page rather than a video detail page; walk the results
            movie_found = 0
            html = etree.fromstring(
                htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
            count_all = len(
                html.xpath(
                    "//div[@class='videothumblist']/div[@class='videos']/div[@class='video']"
                ))
            for count in range(1, count_all + 1):
                number_get = str(
                    html.xpath(
                        "//div[@class='videothumblist']/div[@class='videos']/div["
                        + str(count) + "]/a/div[1]/text()")).strip(" ['']")
                if number_get == number.upper():
                    url_get = str(
                        html.xpath(
                            "//div[@class='videothumblist']/div[@class='videos']/div["
                            + str(count) + "]/a/@href")).strip(" ['.']")
                    htmlcode = get_html('http://' + javlibrary_url + '/ja' +
                                        url_get).replace(u'\xa0', u' ')
                    movie_found = 1
                    break
        if movie_found == 1:
            try:  # fetch the synopsis from dmm
                dww_htmlcode = get_html(
                    "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=" +
                    number.replace("-", '00'))
            except:
                dww_htmlcode = ''
            actor = getActor(htmlcode)
            number = getNum(htmlcode)
            release = getRelease(htmlcode)
            dic = {
                'actor':
                str(actor).strip(" [',']").replace('\'', ''),
                'title':
                getTitle(htmlcode).replace(
                    '中文字幕', '').replace("\\n", '').replace('_', '-').replace(
                        number, '').strip().replace(' ',
                                                    '-').replace('--', '-'),
                'studio':
                getStudio(htmlcode),
                'publisher':
                getPublisher(htmlcode),
                'outline':
                getOutline(dww_htmlcode).replace('\n', '').replace(
                    '\\n', '').replace('\'', '').replace(',',
                                                         '').replace(' ', ''),
                'runtime':
                getRuntime(htmlcode),
                'director':
                str(getDirector(htmlcode)).replace('----', ''),
                'release':
                release,
                'number':
                number,
                'cover':
                getCover(htmlcode),
                'imagecut':
                1,
                'tag':
                getTag(htmlcode),
                'series':
                '',
                'year':
                getYear(release),
                'actor_photo':
                getActorPhoto(actor),
                'website':
                getWebsite(htmlcode),
                'source':
                'javlibrary.py',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    except:
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )  # .encode('UTF-8')
    return js
Example #9
def getTitle(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(
        html.xpath("//h3[@class='post-title text']/a/text()")).strip(" ['']")
    return result
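Examples #9 through #12 all use the same trick: stringify the list returned by xpath() and strip the surrounding brackets and quotes. A hedged alternative that avoids the string manipulation is to index or join the list instead; getTitle_alt below is a hypothetical name, sketched under the assumption that the same htmlcode input is passed in:

from lxml import etree

def getTitle_alt(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    matches = html.xpath("//h3[@class='post-title text']/a/text()")
    # take the first match if present, instead of str(list).strip(" ['']")
    return matches[0].strip() if matches else ''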
Example #10
def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(
        html.xpath("//div[@class='mg-b20 lh4']/text()")).strip(" ['']")
    return result
Example #11
def getWebsite(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = 'http:' + str(
        html.xpath('/html/head/meta[@property=\'og:url\']/@content')).strip(
            " ['']")
    return result
Example #12
def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = 'http:' + str(
        html.xpath("//img[@id='video_jacket_img']/@src")).strip(" ['']")
    return result
Example #13
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    authors = []
    translators = []
    illustrators = []
    pg_producers = []
    title = args.title.replace("'", "’")

    for author in args.author:
        authors.append({
            "name": author.replace("'", "’"),
            "wiki_url": None,
            "nacoaf_url": None
        })

    if args.translator:
        for translator in args.translator:
            translators.append({
                "name": translator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    if args.illustrator:
        for illustrator in args.illustrator:
            illustrators.append({
                "name": illustrator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    identifier = ""
    for author in authors:
        identifier += se.formatting.make_url_safe(author["name"]) + "_"

    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(
        title)

    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(
            translators, False)

        identifier = identifier + "/"

        for translator in translators:
            identifier += se.formatting.make_url_safe(translator["name"]) + "_"

        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(
            illustrators, False)

        identifier = identifier + "/"

        for illustrator in illustrators:
            identifier += se.formatting.make_url_safe(
                illustrator["name"]) + "_"

        identifier = identifier.rstrip("_")

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Get data on authors
    for i, author in enumerate(authors):
        if not args.offline and author["name"].lower() != "anonymous":
            author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(
                author["name"], True)

    # Get data on translators
    for i, translator in enumerate(translators):
        if not args.offline and translator["name"].lower() != "anonymous":
            translator["wiki_url"], translator[
                "nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

    # Get data on illustrators
    for i, illustrator in enumerate(illustrators):
        if not args.offline and illustrator["name"].lower() != "anonymous":
            illustrator["wiki_url"], illustrator[
                "nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath(
                "/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(
                StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)),
                parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]",
                    namespaces=namespaces):
                producers_text = regex.sub(
                    r"^<[^>]+?>", "",
                    etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

                producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                           "\\1",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+",
                                           " ",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ",
                                           producers_text)
                producers_text = producers_text.replace(
                    " and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [
                    producer.strip()
                    for producer in regex.split(',|;', producers_text)
                ]

            # Try to strip out the PG header
            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath(
                    "//*[re:test(text(), 'End of (the )?Project Gutenberg')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, remove it first
            output = regex.sub(r"<\?xml.+?\?>", "",
                               etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations, so keep only the first one
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1",
                               output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(output)

        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception as ex:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None

    if not args.offline and title != "Short Fiction":
        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill titlepage.xhtml
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(
            translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(
            illustrators, False)

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(title,
                                    [author["name"] for author in authors],
                                    contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_cover_svg(title, [author["name"] for author in authors],
                                title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        if contributor_string == "":
            colophon_xhtml = colophon_xhtml.replace(
                " by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>",
                contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace(
                "<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace(
                "</p>\n\t\t\t<p>This ebook was produced for the<br/>",
                f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>"
            )

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(
            r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>",
            authors_xml,
            metadata_xml,
            flags=regex.DOTALL)

        if translators:
            translators_xml = _generate_metadata_contributor_xml(
                translators, "translator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>",
                translators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(
                illustrators, "illustrator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>",
                illustrators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=cs:http://id.loc.gov/authorities/subjects&q=\"{urllib.parse.quote(subject)}\""
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
Example #14
                   "--dry",
                   action="store_true",
                   help="Dry run (do not save output)")
    p.add_argument("-t",
                   "--throttle",
                   action="store_true",
                   help="Throttle requests")
    args = p.parse_args()
    # override print function to only print when verbose is specified
    printv = partial(print_verbose, args.verbose)

    URL = "https://thuisarts.nl/overzicht/onderwerpen"

    printv("[1/5] Getting thuisarts onderwerpen...")
    page = requests.get(URL).text
    tree = etree.parse(StringIO(page), etree.HTMLParser())
    links = tree.xpath('//ul[@class="subject-list"]/li/a')

    # build result list
    results = [{
        "ID": i,
        "title": link.text,
        "link": f'https://thuisarts.nl/{link.get("href")}',
    } for i, link in enumerate(links)]
    printv(f"[2/5] Dumping surface level results to ./thuisarts.yaml...")
    if not args.dry:
        with open("thuisarts.yaml", "w") as f:
            yaml.dump(results, f, allow_unicode=True)

    # scrape the links for each entry in the results list
    printv(f"[3/5] Scraping individual pages...")
Example #15
File: run.py Project: lentinj/diazo
def main():
    """Called from console script
    """
    op = _createOptionParser(usage=usage)
    op.add_option("-x", "--xsl",
                  metavar="transform.xsl",
                  help="XSL transform",
                  dest="xsl",
                  default=None)
    op.add_option("--path",
                  metavar="PATH",
                  help="URI path",
                  dest="path",
                  default=None)
    op.add_option("--parameters",
                  metavar="param1=val1,param2=val2",
                  help="Set the values of arbitrary parameters",
                  dest="parameters",
                  default=None)
    op.add_option("--runtrace-xml",
                  metavar="runtrace.xml",
                  help="Write an xml format runtrace to file",
                  dest="runtrace_xml",
                  default=None)
    op.add_option("--runtrace-html",
                  metavar="runtrace.html",
                  help="Write an html format runtrace to file",
                  dest="runtrace_html",
                  default=None)
    (options, args) = op.parse_args()

    if len(args) > 2:
        op.error("Wrong number of arguments.")
    elif len(args) == 2:
        if options.xsl or options.rules:
            op.error("Wrong number of arguments.")
        path, content = args
        if path.lower().endswith('.xsl'):
            options.xsl = path
        else:
            options.rules = path
    elif len(args) == 1:
        content, = args
    else:
        op.error("Wrong number of arguments.")
    if options.rules is None and options.xsl is None:
        op.error("Must supply either options or rules")

    if options.trace:
        logger.setLevel(logging.DEBUG)

    runtrace = False
    if options.runtrace_xml or options.runtrace_html:
        runtrace = True

    parser = etree.HTMLParser()
    parser.resolvers.add(RunResolver(os.path.dirname(content)))

    if options.xsl is not None:
        output_xslt = etree.parse(options.xsl)
    else:

        xsl_params = None
        if options.xsl_params:
            xsl_params = split_params(options.xsl_params)

        output_xslt = compile_theme(
            rules=options.rules,
            theme=options.theme,
            extra=options.extra,
            parser=parser,
            read_network=options.read_network,
            absolute_prefix=options.absolute_prefix,
            includemode=options.includemode,
            indent=options.pretty_print,
            xsl_params=xsl_params,
            runtrace=runtrace,
        )

    if content == '-':
        content = sys.stdin

    if options.read_network:
        access_control = AC_READ_NET
    else:
        access_control = AC_READ_FILE

    transform = etree.XSLT(output_xslt, access_control=access_control)
    content_doc = etree.parse(content, parser=parser)
    params = {}
    if options.path is not None:
        params['path'] = "'%s'" % options.path

    if options.parameters:
        for key, value in split_params(options.parameters).items():
            params[key] = quote_param(value)

    output_html = transform(content_doc, **params)
    if isinstance(options.output, basestring):
        out = open(options.output, 'wt')
    else:
        out = options.output
    out.write(str(output_html))

    if runtrace:
        runtrace_doc = diazo.runtrace.generate_runtrace(
            rules=options.rules,
            error_log=transform.error_log)
        if options.runtrace_xml:
            if options.runtrace_xml == '-':
                out = sys.stdout
            else:
                out = open(options.runtrace_xml, 'wt')
            runtrace_doc.write(out, encoding='utf-8',
                               pretty_print=options.pretty_print)
        if options.runtrace_html:
            if options.runtrace_html == '-':
                out = sys.stdout
            else:
                out = open(options.runtrace_html, 'wt')
            out.write(str(diazo.runtrace.runtrace_to_html(runtrace_doc)))

    for msg in transform.error_log:
        if not msg.message.startswith('<runtrace '):
            logger.warn(msg)
Example #16
def getStudio(htmlcode):  # get the studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath(
        '/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result
Example #17
    def canBeMultiple(self, weekList, showID):
        url = 'http://www.rte.ie/player/ie/show/' + showID

        showIDs = []

        try:
            parser = etree.HTMLParser(encoding='utf-8')
            tree = etree.parse(url, parser)

            for shows in tree.xpath(
                    '//div[@class="more-videos-pane"]//article[@class="thumbnail-module"]//a[@class="thumbnail-programme-link"]/@href'
            ):
                show_split = shows.rsplit('/', 2)
                show = str(show_split[1])
                showIDs.append(show)

        except (Exception) as exception:
            print('canBeMultiple: getShows: Error getting show numbers: ',
                  exception)
            showIDs.append(showID)

        # If zero we only have 1 show in this category
        if len(showIDs) == 0:
            showIDs.append(showID)

        short = ''
        name = ''
        date1 = ''
        stream = ''
        channel = ''
        icon = ''
        duration = ''

        for show in showIDs:
            newUrl = 'http://feeds.rasset.ie/rteavgen/player/playlist?showId=' + show

            try:
                # Parse the XML with lxml
                tree = etree.parse(newUrl)

                # Find the first element <entry>
                for elem in tree.xpath('//*[local-name() = "entry"]'):
                    # Iterate through the children of <entry>
                    try:
                        stream = str(elem[0].text)
                    except (Exception) as exception:
                        print("canBeMultiple: stream parse error: ", exception)
                        stream = ''

                    try:
                        name_tmp = str(elem[3].text)
                    except (Exception) as exception:
                        print("canBeMultiple: name_tmp parse error: ",
                              exception)
                        name_tmp = ''

                    try:
                        short_tmp = str(elem[4].text)
                    except (Exception) as exception:
                        print("canBeMultiple: short_tmp parse error: ",
                              exception)
                        short_tmp = ''

                    try:
                        channel = str(elem[5].attrib.get('term'))
                    except (Exception) as exception:
                        print("canBeMultiple: channel parse error: ",
                              exception)
                        channel = ''

                    try:
                        millisecs = int(elem[15].attrib.get('ms'))
                    except (Exception) as exception:
                        print("canBeMultiple: millisecs parse error: ",
                              exception)
                        millisecs = 0

                    try:
                        lastDate = datetime.fromtimestamp(
                            mktime(
                                strptime(str(elem[1].text),
                                         "%Y-%m-%dT%H:%M:%S+00:00"))
                        )  #2012-12-31T12:54:29+00:00
                        date_tmp = lastDate.strftime(u"%a %b %d %Y %H:%M")
                        date1 = _("Added: ") + str(date_tmp)
                    except (Exception) as exception:
                        lastDate = datetime.fromtimestamp(
                            mktime(
                                strptime(str(elem[1].text),
                                         "%Y-%m-%dT%H:%M:%S+01:00"))
                        )  #2012-12-31T12:54:29+01:00
                        date_tmp = lastDate.strftime(u"%a %b %d %Y %H:%M")
                        date1 = _("Added: ") + str(date_tmp)
                        print("canBeMultiple: date1 parse error: ", exception)

                    name = checkUnicode(name_tmp)
                    short = checkUnicode(short_tmp)

                    # Calculate the stream duration
                    duration = _("Duration: ") + str(calcDuration(millisecs))

                    # Only set the Icon if they are enabled
                    if self.showIcon == 'True':
                        try:
                            icon_url = str(elem[22].attrib.get('url'))
                            icon = icon_url[0:-7] + "-261.jpg"
                        except (Exception) as exception:
                            print("canBeMultiple: icon parse error: ",
                                  exception)
                            icon = ''
                    else:
                        icon = ''

                    weekList.append((date1, name, short, channel, stream, icon,
                                     duration, False))

            except (Exception) as exception:
                print("canBeMultiple: Problem parsing data: ", exception)
Example #18
def getTag(htmlcode):  # get the ID number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return result.strip(" ['']").replace("'", '').replace(' ', '')
Example #19
    def process_message(self, message_json: str) -> bool:
        self.logger.debug(f"processing message {message_json}")

        # parse the JSON SQS message
        add_article_msg = AddArticleMessage.from_json(message_json)

        try:
            # fetch the content from the URL in the message
            resp = requests.get(add_article_msg.url)
        except Exception:
            self.logger.exception(
                f"failed to fetch article at url {add_article_msg.url}")
            return False

        self.logger.debug("simplifying content")
        readable_content = Document(resp.text)

        parser = etree.HTMLParser()
        content_dom = etree.fromstring(readable_content.summary(), parser)

        # create an Article model
        article = Article(add_article_msg.user_id)
        article.url = resp.url

        # extract the title from the content
        self.logger.debug("extracting article title")
        article.title = readable_content.title()

        # extract the images from the content
        self.logger.debug("fetching related content")

        for image in content_dom.iter("img"):
            img_url = image.get("src")
            try:
                # fetch the image by the URL
                self.logger.debug(f"fetching related image at {img_url}")

                img_resp = requests.get(img_url)
                img_key = f"{article.user_id}/articles/{article.article_id}/related/{Fetcher.get_filename_from_url(img_resp.url)}"
            except Exception:
                self.logger.exception(
                    f"failed to fetch related image at url {img_url}")
                continue

            # save the images to S3
            self.logger.debug(
                f"writing image {img_url} to S3 with key {img_key}")

            if not self.file_repository.put(img_key, BytesIO(img_resp.content)):
                continue

            # create RelatedContent models for each image and add to the Article
            article.related_content.append(
                RelatedContent(img_resp.headers["Content-Type"], img_key))

            # re-write the content HTML to point to the new image URL
            self.logger.debug(f"re-writing img element with new URL {img_key}")
            image.set("src", img_key)

        # write the content to S3
        content_key = f"{article.user_id}/articles/{article.article_id}/content.html"

        self.logger.debug(f"writing content to S# with key {content_key}")
        if not self.file_repository.put(
                content_key,
                BytesIO(
                    etree.tostring(content_dom.getroottree(),
                                   pretty_print=True,
                                   method="html"))):
            return False

        # update the Article with the content key
        article.content_key = content_key

        # write the Article to Dynamo
        self.logger.debug(
            f"writing article to debug with keys user_id {article.user_id} article_id {article.article_id}"
        )
        if not self.article_repository.put(article):
            return False

        # send a completed message to SQS
        self.logger.debug("writing completed message to SQS")
        if not self.finished_queue_producer.send_message(
                ArticleFetchCompleteMessage(article.user_id,
                                            article.article_id).to_json()):
            return False

        return True
Example #20
async def main():
    # done, pending = await asyncio.wait(futures, timeout=5)
    with open('100k.csv', 'w', newline='', encoding='utf-8') as csvfile:
        hp = etree.HTMLParser(encoding='utf-8')
        writer = csv.writer(csvfile)
        writer.writerow([
            'nro_documento', 'nombres', 'apellidos', 'fecha_nacim', 'sexo',
            'tipo_aseg', 'beneficiarios_activos', 'enrolado',
            'vencimiento_de_fe_de_vida', 'nro_titular', 'titular',
            'estado_titular', 'meses_de_aporte_titular', 'vencimiento_titular',
            'ultimo_periodo_abonado_titular'
        ])
        start_time = time.time()
        async with ClientSession() as session:
            sem = asyncio.Semaphore(100)
            futures = [
                asyncio.ensure_future(fetch_data(sem, param, session))
                for param in param_generator
            ]
            for i, future in enumerate(asyncio.as_completed(futures)):
                #print(future.result())
                try:
                    t, ced, result_html = await future
                    root = html.fromstring(result_html, parser=hp)
                    nro_documento = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[2]"
                    )[0].text.strip()
                    nombres = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[3]"
                    )[0].text.strip()
                    apellidos = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[4]"
                    )[0].text.strip()
                    fecha_nacim = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[5]"
                    )[0].text.strip()
                    sexo = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[6]"
                    )[0].text.strip()
                    tipo_aseg = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[7]"
                    )[0].text.strip()
                    beneficiarios_activos = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[8]"
                    )[0].text.strip()
                    enrolado = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[9]"
                    )[0].text.strip()
                    vencimiento_de_fe_de_vida = root.xpath(
                        u"/html/body/center[2]/form/table[2]/tr[2]/td[10]"
                    )[0].text.strip()

                    nro_titular = root.xpath(
                        u"/html/body/center[2]/form/table[3]/tr[2]/td[1]"
                    )[0].text.strip()
                    titular = root.xpath(
                        u"/html/body/center[2]/form/table[3]/tr[2]/td[2]"
                    )[0].text.strip()
                    estado_titular = root.xpath(
                        u"/html/body/center[2]/form/table[3]/tr[2]/td[3]"
                    )[0].text.strip()
                    meses_de_aporte_titular = root.xpath(
                        u"/html/body/center[2]/form/table[3]/tr[2]/td[4]"
                    )[0].text.strip()
                    vencimiento_titular = root.xpath(
                        u"/html/body/center[2]/form/table[3]/tr[2]/td[5]"
                    )[0].text.strip()
                    ultimo_periodo_abonado_titular = root.xpath(
                        u"/html/body/center[2]/form/table[3]/tr[2]/td[6]"
                    )[0].text.strip()

                    print('{}, {}, {} returned in {:.2f} seconds'.format(
                        nro_documento, nombres, apellidos, t))
                    writer.writerow([
                        nro_documento, nombres, apellidos, fecha_nacim, sexo,
                        tipo_aseg, beneficiarios_activos, enrolado,
                        vencimiento_de_fe_de_vida, nro_titular, titular,
                        estado_titular, meses_de_aporte_titular,
                        vencimiento_titular, ultimo_periodo_abonado_titular
                    ])

                except Exception as e:
                    print("Cedula: %s no existe" % (ced))
                    continue
            t_total = time.time() - start_time
            nr_of_requests = ((stop + 1) - start)
            print("Process took: {:.2f} seconds".format(t_total))
            print("{} requests per second".format(nr_of_requests / t_total))
Example #21
File: lxml_etree.py Project: kknet/YHZX
from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())  # open local file
result = html.xpath('//li/a/@href')

# ################# or ####################

parser = etree.HTML(html)  # etree.HTML() expects HTML text (a string), not a file handle like parse() above
result = parser.xpath('//li/a/text()')

# contains()
result = parser.xpath('//li[contains(@class, "test")]/a/@value')

result = parser.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')

# position
result = parser.xpath('//li[1]/a/text()')
result = parser.xpath('//li[last()]/a/text()')
result = parser.xpath('//li[last()-2]/a/text()')  # The third from last
result = parser.xpath('//li[position()<3]/a/text()')

# node axes: ancestor, attribute, child, descendant, following, following-sibling
result = parser.xpath('//li[1]/ancestor::*')
result = parser.xpath('//li[1]/ancestor::div')
result = parser.xpath('//li[1]/attribute::*')
result = parser.xpath('//li[1]/child::a[@href="link1.html"]')
result = parser.xpath('//li[1]/descendant::span')
result = parser.xpath('//li[1]/following::*[2]')
result = parser.xpath('//li[1]/following-sibling::*')
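The queries above assume a test.html on disk; as a quick self-check, the same contains(), position() and axis selectors can be run against a small inline string. The markup below is invented purely for illustration:

from lxml import etree

# hypothetical markup, only to make the selectors above runnable end to end
snippet = '''
<div>
  <ul>
    <li class="item li-first" name="item"><a href="link1.html">first</a></li>
    <li class="item li test" name="item"><a href="link2.html" value="v2">second</a></li>
    <li class="item li-last"><a href="link3.html">third</a></li>
  </ul>
</div>
'''
root = etree.HTML(snippet)

print(root.xpath('//li/a/@href'))                                # all hrefs
print(root.xpath('//li[contains(@class, "test")]/a/@value'))     # attribute filter
print(root.xpath('//li[contains(@class, "li") and @name="item"]/a/text()'))
print(root.xpath('//li[last()]/a/text()'))                       # last <li>
print(root.xpath('//li[1]/following-sibling::*'))                # elements after the first <li>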
Example #22
0
def html2dom(htmlstr):
    parser = etree.HTMLParser(remove_blank_text=True,
                              remove_comments=True,
                              remove_pis=True)
    domtree = etree.fromstring(htmlstr, parser)
    return etree.ElementTree(domtree)
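A short usage sketch for html2dom; the input string is invented, and the serialized output in the comment is the expected shape rather than a verified result for a specific lxml version:

from lxml import etree

tree = html2dom('<p>hello<!-- a comment --><?target data?></p>')
# remove_comments/remove_pis strip the comment and the processing instruction,
# so the serialized tree should look roughly like:
#   <html><body><p>hello</p></body></html>
print(etree.tostring(tree, method='html').decode())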
    print char_type['language'] in language

    if not (char_type['language'] in language):
        print char_type['language']
        print char_type['language'] in language

    if (char_type["encoding"] == 'GB2312'):
        try:
            html = html.decode('gbk').encode('utf-8')
        except UnicodeDecodeError, e:
            print "编码有些问题,已跳过"

    else:
        html = unicode(html, char_type["encoding"]).encode("utf-8")

    pagecontent = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
    # parse the page
    p = pagecontent.xpath('//body//div//p//text()')
    print type(p)
    print p
    # filecontent = ''
    for i in range(len(p)):
        print  p[i]
        print type(p[i])
        file.write(p[i])
        # filecontent = filecontent+p[i].xpath('string()')
    # strip blank lines
    # filestringcontent = ''
    # for line in filecontent.splitlines():
    #     if not line.split():
    #         continue
Example #24
0
def parms_page(html):
    root = etree.HTML(html, parser=etree.HTMLParser(encoding="utf-8"))
    # the regex pattern is left empty as in the original; re.findall expects a
    # string, so serialize the parsed tree before searching it
    html_parms = re.findall('', etree.tostring(root, encoding='unicode'))
Example #25
0
from lxml import etree

html = etree.parse('./text.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/text()')
print(result)

result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)

result = html.xpath('//li[@class="item-0"]//text()')
print(result)
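The three queries differ in which text nodes they return: /text() after the li yields only text that is a direct child of the li, /a/text() yields the link text, and //text() yields every descendant text node. A tiny inline sketch (markup invented) makes the difference visible:

from lxml import etree

text = '<ul><li class="item-0"><a href="link1.html">first item</a></li></ul>'
html = etree.HTML(text)

print(html.xpath('//li[@class="item-0"]/text()'))    # [] - no text directly inside <li>
print(html.xpath('//li[@class="item-0"]/a/text()'))  # ['first item']
print(html.xpath('//li[@class="item-0"]//text()'))   # ['first item']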
Example #26
0
    def transform(self, pretty_print=True):
        """change the self.html and return it with CSS turned into style
        attributes.
        """
        if etree is None:
            return self.html

        parser = etree.HTMLParser()
        stripped = self.html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml inserts a doctype if none exists, so only include it in
        # the root if it was in the original html.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page

        if page is None:
            print repr(self.html)
            raise PremailerError("Could not parse the html")
        assert page is not None

        ##
        ## style selectors
        ##

        rules = []

        for index, style in enumerate(CSSSelector('style')(page)):
            # If we have a media attribute whose value is anything other than
            # 'screen', ignore the ruleset.
            media = style.attrib.get('media')
            if media and media != 'screen':
                continue

            these_rules, these_leftover = self._parse_style_rules(
                style.text, index)
            rules.extend(these_rules)

            parent_of_style = style.getparent()
            if these_leftover:
                style.text = '\n'.join(
                    ['%s {%s}' % (k, v) for (k, v) in these_leftover])
            elif not self.keep_style_tags:
                parent_of_style.remove(style)

        if self.external_styles:
            for stylefile in self.external_styles:
                if stylefile.startswith('http://'):
                    css_body = urllib.urlopen(stylefile).read()
                elif os.path.exists(stylefile):
                    try:
                        f = codecs.open(stylefile)
                        css_body = f.read()
                    finally:
                        f.close()
                else:
                    raise ValueError(u"Could not find external style: %s" %
                                     stylefile)
                these_rules, these_leftover = self._parse_style_rules(
                    css_body, -1)
                rules.extend(these_rules)

        # rules is a list of (specificity, selector, style) tuples, where
        # specificity is a tuple ordered so that more specific rules sort larger.
        rules.sort(key=operator.itemgetter(0))

        first_time = []
        first_time_styles = []
        for __, selector, style in rules:
            new_selector = selector
            class_ = ''
            if ':' in selector:
                new_selector, class_ = re.split(':', selector, 1)
                class_ = ':%s' % class_
            # Keep filter-type selectors untouched.
            if class_ in FILTER_PSEUDOSELECTORS:
                class_ = ''
            else:
                selector = new_selector

            sel = CSSSelector(selector)
            for item in sel(page):
                old_style = item.attrib.get('style', '')
                if item not in first_time:
                    new_style = merge_styles(old_style, style, class_)
                    first_time.append(item)
                    first_time_styles.append((item, old_style))
                else:
                    new_style = merge_styles(old_style, style, class_)
                item.attrib['style'] = new_style
                self._style_to_basic_html_attributes(item,
                                                     new_style,
                                                     force=True)

        # Re-apply initial inline styles.
        for item, inline_style in first_time_styles:
            old_style = item.attrib.get('style', '')
            if not inline_style:
                continue
            new_style = merge_styles(old_style, inline_style, class_)
            item.attrib['style'] = new_style
            self._style_to_basic_html_attributes(item, new_style, force=True)

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath('//@class'):
                parent = item.getparent()
                del parent.attrib['class']

        ##
        ## URLs
        ##
        if self.base_url:
            for attr in ('href', 'src'):
                for item in page.xpath("//@%s" % attr):
                    parent = item.getparent()
                    if attr == 'href' and self.preserve_internal_links \
                           and parent.attrib[attr].startswith('#'):
                        continue
                    if not self.base_url.endswith('/'):
                        self.base_url += '/'
                    parent.attrib[attr] = urlparse.urljoin(
                        self.base_url, parent.attrib[attr].strip('/'))

        out = etree.tostring(root, method="html", pretty_print=pretty_print)
        if self.strip_important:
            out = _importants.sub('', out)
        return out
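Example #26 is premailer-style CSS inlining. Assuming this transform() comes from (or mirrors) the premailer package, driving it might look like the sketch below; the class name and keyword arguments are taken from premailer's public API, not from the snippet itself:

from premailer import Premailer  # assumption: the snippet mirrors the premailer package

html = """<html>
  <head><style>p.greeting { color: red }</style></head>
  <body><p class="greeting">Hello</p></body>
</html>"""

p = Premailer(html, remove_classes=True)
print(p.transform())
# the <style> rule should be moved onto the element, roughly:
#   <p style="color:red">Hello</p>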
Example #27
0
    def displayHtmlEntry(self, entry, author, nick, url):

        prepend = '''\
<div class="status__prepend">
  <span>
    <a href="%s" class="status__display-name">
      <strong>%s</strong>
    </a>
    shared
  </span>
</div>
'''

        status = '''\
<div class="status">
  <div class="status__header">
    <a class="status__relative-time" href="%s">
      <time class="time-ago" datetime="%s">%s</time>
    </a>
    <a class="status__display-name" href="%s">
      <span class="display-name">
	<strong>%s</strong>
	<span>@%s</span>
      </span>
    </a>
  </div>
  <div class="status__content">%s</div>
</div>
'''

        id = entry.xpath('atom:id/text()',
                         namespaces={"atom": "http://www.w3.org/2005/Atom"})[0]

        updated = entry.xpath(
            'atom:updated/text()',
            namespaces={"atom": "http://www.w3.org/2005/Atom"})[0]

        verb = entry.xpath(
            'activity:verb/text()',
            namespaces={"activity": "http://activitystrea.ms/spec/1.0/"})[0]

        if verb == 'http://activitystrea.ms/schema/1.0/share':

            print(prepend % (id, author))

            author = entry.xpath(
                'activity:object/atom:author/poco:displayName/text()',
                namespaces={
                    "activity": "http://activitystrea.ms/spec/1.0/",
                    "atom": "http://www.w3.org/2005/Atom",
                    "poco": "http://portablecontacts.net/spec/1.0"
                })[0]

            nick = entry.xpath('activity:object/atom:author/atom:name/text()',
                               namespaces={
                                   "activity":
                                   "http://activitystrea.ms/spec/1.0/",
                                   "atom": "http://www.w3.org/2005/Atom"
                               })[0]

            url = entry.xpath('activity:object/atom:author/atom:id/text()',
                              namespaces={
                                  "activity":
                                  "http://activitystrea.ms/spec/1.0/",
                                  "atom": "http://www.w3.org/2005/Atom"
                              })[0]

        content = entry.xpath(
            'atom:content/text()',
            namespaces={"atom": "http://www.w3.org/2005/Atom"})[0]
        parser = etree.HTMLParser()
        tree = etree.fromstring(content, parser)
        content = etree.tostring(tree, encoding='unicode', method='html')

        print(status % (id, updated, updated, url, author, nick, content))
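The entry lookups above rely on lxml's namespaces= mapping; a stripped-down sketch (the Atom fragment is invented) shows the same pattern without the ActivityStreams parts:

from lxml import etree

# minimal, made-up Atom entry to demonstrate the prefix -> namespace mapping
atom_entry = b'''<entry xmlns="http://www.w3.org/2005/Atom">
  <id>tag:example.org,2024:1</id>
  <updated>2024-01-01T00:00:00Z</updated>
  <content type="html">&lt;p&gt;hello&lt;/p&gt;</content>
</entry>'''

entry = etree.fromstring(atom_entry)
ns = {"atom": "http://www.w3.org/2005/Atom"}

print(entry.xpath('atom:id/text()', namespaces=ns)[0])
print(entry.xpath('atom:updated/text()', namespaces=ns)[0])

# the content is HTML-escaped text, so it gets re-parsed with the HTML parser,
# just as displayHtmlEntry does above
content = entry.xpath('atom:content/text()', namespaces=ns)[0]
tree = etree.fromstring(content, etree.HTMLParser())
print(etree.tostring(tree, encoding='unicode', method='html'))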
import os
import os.path as op
import re

from lxml import etree

parser = etree.HTMLParser(encoding='utf-8')
from time import sleep
from urllib.parse import urlsplit, parse_qs
import requests_cache

from validators import validate_raw_files, check_products_detection
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, shop_inventory_lw_csv

from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = 'sip_whisky'
root_url = 'https://sipwhiskey.com'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'USA'

searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()
from lxml import etree

text = '''
<div>
<url>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link1.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</url>
</div>
'''

html = etree.HTML(text)
# the HTML class initializes from a text string and builds an XPath-ready parse tree
result = etree.tostring(html)
# tostring() outputs the corrected HTML (lxml auto-completes missing tags)
print(result.decode('utf-8'))


You can also read a local file directly and parse it:


from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

    def transform(self, pretty_print=True, **kwargs):
        """change the self.html and return it with CSS turned into style
        attributes.
        """
        if etree is None:
            return self.html

        if self.method == 'xml':
            parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
        else:
            parser = etree.HTMLParser()
        stripped = self.html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml inserts a doctype if none exists, so only include it in
        # the root if it was in the original html.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page

        if page is None:
            print repr(self.html)
            raise ValueError("Could not parse the html")
        assert page is not None

        ## style tags
        for element in CSSSelector('style,link[rel~=stylesheet]')(page):
            # If we have a media attribute whose value is anything other than
            # 'screen', ignore the ruleset.
            media = element.attrib.get('media')
            if media and media != 'screen':
                continue

            is_style = element.tag == 'style'
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get('href')
                if not href:
                    continue
                css_body = self._load_external(href)

            self._parse_style_rules(css_body)

            parent_of_element = element.getparent()
            if not self.keep_style_tags or not is_style:
                parent_of_element.remove(element)

        ## explicitly defined external style file
        if self.external_styles:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                self._parse_style_rules(css_body)

        ## styles from element selectors, runs before class selectors
        for elem in page.xpath('//*'):
            if elem.tag in self.rules:
                old_style = elem.attrib.get('style', '')
                new_style = self.rules[elem.tag]
                if old_style:
                    #replace any old property values with new property value
                    old_cleaned_style = ''

                    for property in old_style.split(';'):
                        if len(property.split(':')) != 2:
                            continue
                        else:
                            property_name, property_val = property.split(':')
                            if new_style.find(property_name) < 0:
                                old_cleaned_style += property + ';'
                    new_style = '; '.join([old_cleaned_style, new_style])
                elem.attrib['style'] = new_style

        ## styles from class selectors
        for tag_classes in page.xpath('//@class'):
            tag = tag_classes.getparent()
            tag_classes = [
                '.' + c.strip() for c in tag_classes.split(' ') if c.strip()
            ]
            for tag_class in tag_classes:
                if tag_class in self.rules:
                    old_style = tag.attrib.get('style', '')
                    new_style = self.rules[tag_class]
                    if old_style:
                        #replace any old property values with new property value
                        old_cleaned_style = ''

                        for property in old_style.split(';'):
                            if len(property.split(':')) != 2:
                                continue
                            else:
                                property_name, property_val = property.split(
                                    ':')
                                if new_style.find(property_name) < 0:
                                    old_cleaned_style += property + ';'
                        new_style = '; '.join([old_cleaned_style, new_style])
                    tag.attrib['style'] = new_style

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath('//@class'):
                parent = item.getparent()
                del parent.attrib['class']

        kwargs.setdefault('method', self.method)
        kwargs.setdefault('pretty_print', pretty_print)
        out = etree.tostring(root, **kwargs)
        if self.method == 'xml':
            out = _cdata_regex.sub(
                lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
        if self.strip_important:
            out = _importants.sub('', out)
        return out
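The property-merging loop in this transform() appears twice (once for tag rules, once for class rules). Distilled into a standalone helper it amounts to the sketch below; the function name is mine, not from the source, and the output comment shows the same spacing quirks the original loop produces:

def merge_inline_style(old_style, new_style):
    """Keep old declarations whose property the new rule does not set,
    then append the new rule, mirroring the loop in transform() above."""
    if not old_style:
        return new_style
    kept = ''
    for declaration in old_style.split(';'):
        if len(declaration.split(':')) != 2:
            continue
        prop, _val = declaration.split(':')
        if new_style.find(prop) < 0:   # the new rule does not touch this property
            kept += declaration + ';'
    return '; '.join([kept, new_style])

# merge_inline_style('color: blue; padding: 4px', 'color: red')
# -> ' padding: 4px;; color: red'   (same quirks as the original loop)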