def asyncServe(privmsg, query, yearhint):
    """Search IMDb for *query* and report the top vote-sorted hit.

    yearhint, when truthy, narrows the search to a two-year window
    around that year.  Replies via privmsg.reply_imore(); returns
    nothing if the result table is missing from the page.
    """
    criteria = {'title': query, 'sort': 'num_votes,desc'}
    if yearhint:
        # Widen the release-date filter to one year either side of the hint.
        criteria['release_date'] = "%04i-06-01,%04i-06-01"%(
            yearhint-1, yearhint+1)
    url = "http://akas.imdb.com/search/title?" + urlencode(criteria)
    debug("int_imdb", "url", url)
    page = urllib.request.urlopen(url).read(50000).decode("iso-8859-1")
    # Everything before the results table is irrelevant; drop it so the
    # regexes below match the first (best) result row.
    page = page.partition('<table class="results">')[2]
    if not page:
        debug("int_imdb", "result-table not found.")
        return

    def first_group(pattern):
        # First capture group of pattern within the result table.
        return re.search(pattern, page).group(1)

    imdb_id = first_group(r'<a href="/title/(tt[^/]+)/">')
    name = htmlDecodeEntites(first_group(r'<a href="/title/tt[^/]+/">([^<]+)</a>'))
    director = htmlDecodeEntites(first_group(r'Dir: <a[^>]+>([^<]+)</a>'))
    year = first_group(r'<span class="year_type">\((\d{4})')
    rating = first_group(r'itemprop="ratingValue">([0-9.]+|-)</span>')
    privmsg.reply_imore("%s (%s) %s [%s] http://www.imdb.com/title/%s/"%(
        name, year, director, rating, imdb_id))
def quoteTweet(url, privmsg):
    """Fetch a tweet page and echo '<@user> text' to the channel."""
    markup = urllib.request.urlopen(url).read(6000).decode("utf8")

    def meta_content(name):
        # Meta-tag values arrive double-escaped, hence decoding twice.
        raw = re.search(
            r'<meta content="([^"]*)" name="' + name + '" />',
            markup).group(1)
        return htmlDecodeEntites(htmlDecodeEntites(raw))

    tweet = meta_content("description")
    screen_name = meta_content("page-user-screen_name")
    # Collapse all runs of whitespace (newlines included) to single spaces.
    privmsg.reply("<@%s> %s"%(screen_name, re.sub(r"\s+"," ", tweet)))
def get_youtube_title(url, privmsg):
    """Fetch *url* and return its <title> text, whitespace-collapsed.

    Returns the decoded title string on success, "" when the response is
    not an HTML/XML document, False when no <title> tag is present, and
    "(blocked or error)" on an HTTP error.  *privmsg* is accepted for
    interface compatibility with the other handlers but is not used here.
    """
    try:
        opener = urllib.request.build_opener()
        # Some sites refuse the default urllib user agent.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        # Close the response deterministically instead of leaking it.
        with opener.open(url) as response:
            # FIX: getheader() returns None when the header is missing,
            # which used to crash re.match with a TypeError; treat a
            # missing header like a non-HTML content type.
            ctype = response.getheader("Content-Type") or ""
            if not re.match("text/x?html|application/[^;]*(xhtml|xml)[^;]*",
                            ctype):
                return ""
            htmlcode = response.read(10240)
        title = re.search(b"<title[^>]*>(.*?)</title>", htmlcode,
                          re.IGNORECASE | re.DOTALL)
        if not title:
            return False
        title = title.group(1)
        try:
            title = title.decode("utf8")
        except UnicodeDecodeError:
            # Latin-1 decodes any byte sequence, so this cannot raise.
            title = title.decode("iso-8859-1")
        finally:
            # Defensive: if title is somehow still bytes, fall back to
            # its printable repr (minus the b'...' wrapper).
            if hasattr(title, 'decode'):
                title = repr(title)[2:-1]
        title = htmlDecodeEntites(title)
        # Collapse all internal whitespace to single spaces.
        title = " ".join(title.split())
    except urllib.error.HTTPError:
        return "(blocked or error)"
    return title
def quoteTweet(url, privmsg):
    """Announce a chaosradio.de episode: title, subtitle, length, date."""
    url = "http://chaosradio.de/%s.html"%url.lower()
    page = urllib.request.urlopen(url).read(20000).decode("utf8")

    def grab(pattern):
        # First capture group of pattern, with HTML entities decoded.
        return htmlDecodeEntites(re.search(pattern, page).group(1))

    show = grab(r'<h1[^>]*>([^<]*)</h1>')
    tagline = grab(r'<p [^ ]*class="subtitle">([^<]*)</p>')
    clock = grab(r'<br>Dauer: (\d\d:\d\d:\d\d)h</p>')          # hh:mm:ss
    hours = int(clock[:2])
    # Seconds contribute a fraction of a minute; %02i truncates it below.
    minutes = int(clock[3:5]) + int(clock[6:8]) / 60
    date = grab(r'Veröffentlicht am: (\d\d.\d\d.\d\d\d\d),')   # dd.mm.yyyy
    privmsg.reply("%s %s: %s [%i:%02i %s.%s]"%(
        url, show, tagline, hours, minutes, date[3:5], date[8:10]))
def quoteTweet(url, privmsg):
    """Announce an alternativlos.org episode with its topic and guests."""
    url = "http://www.alternativlos.org/%s/"%url
    page = urllib.request.urlopen(url).read(20000).decode("utf8")
    topic_raw = re.search(
        r'<h.>Them..?</h.>.*?<p>(.*?)<h.>', page, re.DOTALL).group(1)
    topic = re.sub(r'\s+', " ", htmlDecodeEntites(topic_raw))
    try:
        guest_section = re.search(
            r'<h.>Gäste</h.>.*?(<p>)?(.*?)<h.>', page, re.DOTALL).group(2)
        guest_section = re.sub(r'\s+', " ", htmlDecodeEntites(guest_section))
        guests = re.findall(r'<a[^>]*>(.*?)</a>', guest_section, re.DOTALL)
    except AttributeError:
        # No guest section found on this episode page.
        guests = []
    if guests:
        privmsg.reply("%s mit %s: %s"%(url, ', '.join(guests), topic))
    else:
        privmsg.reply("%s %s"%(url, topic))
def getArticle(title, lang):
    # Fetch a Wikipedia page via the search entry point and extract either a
    # disambiguation link list or a plain-text rendering of the article body.
    #
    # title: search term.
    # lang:  sequence where lang[0] is the wiki subdomain (e.g. "de") and
    #        lang[2] the localized disambiguation-category label
    #        (presumably; confirm against the caller).
    # Returns one of:
    #   (0, 0, 0)               -- landed on a special page, no article
    #   (2, links, heading)     -- disambiguation page: list of article titles
    #   (1, definition, heading)-- normal article: extracted text
    opener = urllib.request.build_opener()
    # Wikipedia rejects the default urllib agent, so present a browser UA.
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    url = "http://%s.wikipedia.org/w/index.php?%s"%(
        lang[0],urllib.parse.urlencode({"search":title}))
    info("int_wiki","opening url",url)
    response = opener.open(url)
    text = response.read()
    #info("int_wiki", str(text[:10000]))
    text = text.decode("utf-8")
    # An "ns-special" body class means we ended up on Special:Search etc.
    if re.search(r'<body class=".*? ns-special', text):
        info("int_wiki", "not an article")
        return 0,0,0
    # MediaWiki brackets the article body with bodycontent/bodytext markers
    # (the name varies between skin versions, hence the alternation).
    content = re.search(
        "<!-- (?:bodycontent|bodytext) -->(.*?)<!-- (?:/bodycontent|/bodytext) -->",
        text, re.DOTALL).group(1)
    heading = re.search(
        '<h1 id="firstHeading" class="firstHeading">(.*?)</h1>',
        text, re.DOTALL).group(1)
    # Section redirects (Foo#Bar) embed a redirectToFragment() call; when
    # present, start extraction at the target section's headline.
    try:
        redirect = re.search(
            r'redirectToFragment\("#(.*?)"\);', text,re.DOTALL).group(1)
    except AttributeError:
        redirect = False
    info("int_wiki", "redir", redirect)
    if redirect:
        targetBegin = re.search(
            r'<h\d>(<span .*?</span> )?<span class="mw-headline" id="'
            + redirect + '">', content).start()
        content = content[targetBegin:]
    # If the article belongs to the local disambiguation category (lang[2]),
    # return the linked article titles instead of body text.
    if re.search('<div id="mw-normal-catlinks">.*?<a[^>]*>%s</a>'%lang[2],text):
        content = htmlDecodeEntites(content)
        links = re.findall('<li>.*?<a href="/wiki/(.*?)"[^>]*>.*?</a>.*?</li>',content)
        # URL-decode and turn underscores back into spaces.
        links = [urllib.parse.unquote(x.replace("_", " ")) for x in links]
        return 2,links,heading
    else:
        # Normal article: parse the body as XML and flatten it to text.
        content = escapearguments(content)
        etr = etree.fromstring(content)
        definition = ""
        def parseP(ele):
            # Flatten one element (text plus children) to a single-line string.
            if ele.text:
                eletext = ele.text
            else:
                eletext = ""
            eletext += etreeToText(ele)
            eletext = eletext.replace(
                "\n"," ").replace(
                "\r"," ").replace(
                "\t"," ").strip()
            return eletext
        # Walk the top-level body elements, rendering each kind with a
        # lightweight wiki-ish markup (*, #, ==headline==, ...).
        for element in etr.getchildren():
            if element.tag == "p":
                definition += parseP(element) + " "
            elif element.tag == "dl":
                definition += parseP(element) + " "
            elif element.tag == "ul":
                for subelement in element.findall("li"):
                    definition += "* " + parseP(subelement) + " "
                definition += "/* "
            elif element.tag == "ol":
                for subelement in element.findall("li"):
                    definition += "# " + parseP(subelement) + " "
                definition += "/# "
            elif re.match("h[2345]",element.tag):
                # Only the span carrying class "mw-headline" holds the title;
                # mark it with ='s matching the heading level.
                for subelement in element.findall("span"):
                    level = int(element.tag[1])
                    if subelement.get("class") == "mw-headline":
                        definition += "="*level + parseP(subelement) + "="*level + " "
            elif element.tag == "blockquote":
                definition += parseP(element) + " "
            else:
                pass
        # Resolve named HTML entities; unknown names are kept verbatim.
        definition = re.sub(r"&(\w+);",lambda match: html.entities.entitydefs.get(match.group(1),"&" + match.group(1) + ";"),definition)
        def repl42(match):
            # Drop empty brackets "[ ]"/"( )" left over after markup removal,
            # collapsing surrounding whitespace to one space when it exists
            # on both sides.
            if match.group(1) and match.group(3):
                return " "
            else:
                return ""
        definition = re.sub(r"(\s*)(\[\s*\]|\(\s*\))(\s*)",repl42,definition)
        # TODO: replace &#xyz; entities
        # The rendered body usually repeats the heading first; strip it.
        if definition.startswith(heading):
            definition = definition[len(heading)+1:]
        return 1,definition,heading