def asyncServe(privmsg, query, yearhint):
    """Search IMDb for *query* and report the top vote-sorted hit.

    yearhint, when truthy, narrows the search to a two-year window
    around that year.  Replies via privmsg.reply_imore(); returns
    nothing if the result table is missing from the page.
    """
    criteria = {'title': query, 'sort': 'num_votes,desc'}
    if yearhint:
        # Widen the release-date filter to one year either side of the hint.
        criteria['release_date'] = "%04i-06-01,%04i-06-01"%(
            yearhint-1, yearhint+1)
    url = "http://akas.imdb.com/search/title?" + urlencode(criteria)
    debug("int_imdb", "url", url)
    page = urllib.request.urlopen(url).read(50000).decode("iso-8859-1")
    # Everything before the results table is irrelevant; drop it so the
    # regexes below match the first (best) result row.
    page = page.partition('<table class="results">')[2]
    if not page:
        debug("int_imdb", "result-table not found.")
        return

    def first_group(pattern):
        # First capture group of pattern within the result table.
        return re.search(pattern, page).group(1)

    imdb_id = first_group(r'<a href="/title/(tt[^/]+)/">')
    name = htmlDecodeEntites(first_group(r'<a href="/title/tt[^/]+/">([^<]+)</a>'))
    director = htmlDecodeEntites(first_group(r'Dir: <a[^>]+>([^<]+)</a>'))
    year = first_group(r'<span class="year_type">\((\d{4})')
    rating = first_group(r'itemprop="ratingValue">([0-9.]+|-)</span>')
    privmsg.reply_imore("%s (%s) %s [%s] http://www.imdb.com/title/%s/"%(
        name, year, director, rating, imdb_id))
def quoteTweet(url, privmsg):
    """Fetch a tweet page and echo '<@user> text' to the channel."""
    markup = urllib.request.urlopen(url).read(6000).decode("utf8")

    def meta_content(name):
        # Meta-tag values arrive double-escaped, hence decoding twice.
        raw = re.search(
            r'<meta content="([^"]*)" name="' + name + '" />',
            markup).group(1)
        return htmlDecodeEntites(htmlDecodeEntites(raw))

    tweet = meta_content("description")
    screen_name = meta_content("page-user-screen_name")
    # Collapse all runs of whitespace (newlines included) to single spaces.
    privmsg.reply("<@%s> %s"%(screen_name, re.sub(r"\s+"," ", tweet)))
def get_youtube_title(url, privmsg):
    """Fetch *url* and return its <title> text, whitespace-collapsed.

    Returns the decoded title string on success, "" when the response is
    not an HTML/XML document, False when no <title> tag is present, and
    "(blocked or error)" on an HTTP error.  *privmsg* is accepted for
    interface compatibility with the other handlers but is not used here.
    """
    try:
        opener = urllib.request.build_opener()
        # Some sites refuse the default urllib user agent.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        # Close the response deterministically instead of leaking it.
        with opener.open(url) as response:
            # FIX: getheader() returns None when the header is missing,
            # which used to crash re.match with a TypeError; treat a
            # missing header like a non-HTML content type.
            ctype = response.getheader("Content-Type") or ""
            if not re.match("text/x?html|application/[^;]*(xhtml|xml)[^;]*",
                            ctype):
                return ""
            htmlcode = response.read(10240)
        title = re.search(b"<title[^>]*>(.*?)</title>", htmlcode,
                          re.IGNORECASE | re.DOTALL)
        if not title:
            return False
        title = title.group(1)
        try:
            title = title.decode("utf8")
        except UnicodeDecodeError:
            # Latin-1 decodes any byte sequence, so this cannot raise.
            title = title.decode("iso-8859-1")
        finally:
            # Defensive: if title is somehow still bytes, fall back to
            # its printable repr (minus the b'...' wrapper).
            if hasattr(title, 'decode'):
                title = repr(title)[2:-1]
        title = htmlDecodeEntites(title)
        # Collapse all internal whitespace to single spaces.
        title = " ".join(title.split())
    except urllib.error.HTTPError:
        return "(blocked or error)"
    return title
def quoteTweet(url, privmsg):
    """Announce a chaosradio.de episode: title, subtitle, length, date."""
    url = "http://chaosradio.de/%s.html"%url.lower()
    page = urllib.request.urlopen(url).read(20000).decode("utf8")

    def grab(pattern):
        # First capture group of pattern, with HTML entities decoded.
        return htmlDecodeEntites(re.search(pattern, page).group(1))

    show = grab(r'<h1[^>]*>([^<]*)</h1>')
    tagline = grab(r'<p [^ ]*class="subtitle">([^<]*)</p>')
    clock = grab(r'<br>Dauer: (\d\d:\d\d:\d\d)h</p>')          # hh:mm:ss
    hours = int(clock[:2])
    # Seconds contribute a fraction of a minute; %02i truncates it below.
    minutes = int(clock[3:5]) + int(clock[6:8]) / 60
    date = grab(r'Veröffentlicht am: (\d\d.\d\d.\d\d\d\d),')   # dd.mm.yyyy
    privmsg.reply("%s %s: %s [%i:%02i %s.%s]"%(
        url, show, tagline, hours, minutes, date[3:5], date[8:10]))
def quoteTweet(url, privmsg):
    """Announce an alternativlos.org episode with its topic and guests."""
    url = "http://www.alternativlos.org/%s/"%url
    page = urllib.request.urlopen(url).read(20000).decode("utf8")
    topic_raw = re.search(
        r'<h.>Them..?</h.>.*?<p>(.*?)<h.>', page, re.DOTALL).group(1)
    topic = re.sub(r'\s+', " ", htmlDecodeEntites(topic_raw))
    try:
        guest_section = re.search(
            r'<h.>Gäste</h.>.*?(<p>)?(.*?)<h.>', page, re.DOTALL).group(2)
        guest_section = re.sub(r'\s+', " ", htmlDecodeEntites(guest_section))
        guests = re.findall(r'<a[^>]*>(.*?)</a>', guest_section, re.DOTALL)
    except AttributeError:
        # No guest section found on this episode page.
        guests = []
    if guests:
        privmsg.reply("%s mit %s: %s"%(url, ', '.join(guests), topic))
    else:
        privmsg.reply("%s %s"%(url, topic))
def getArticle(title, lang):
    # Fetch a Wikipedia page via the search entry point and extract either a
    # disambiguation link list or a plain-text rendering of the article body.
    #
    # title: search term.
    # lang:  sequence where lang[0] is the wiki subdomain (e.g. "de") and
    #        lang[2] the localized disambiguation-category label
    #        (presumably; confirm against the caller).
    # Returns one of:
    #   (0, 0, 0)               -- landed on a special page, no article
    #   (2, links, heading)     -- disambiguation page: list of article titles
    #   (1, definition, heading)-- normal article: extracted text
    opener = urllib.request.build_opener()
    # Wikipedia rejects the default urllib agent, so present a browser UA.
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    url = "http://%s.wikipedia.org/w/index.php?%s"%(
        lang[0],urllib.parse.urlencode({"search":title}))
    info("int_wiki","opening url",url)
    response = opener.open(url)
    text = response.read()
    #info("int_wiki", str(text[:10000]))
    text = text.decode("utf-8")
    # An "ns-special" body class means we ended up on Special:Search etc.
    if re.search(r'<body class=".*? ns-special', text):
        info("int_wiki", "not an article")
        return 0,0,0
    # MediaWiki brackets the article body with bodycontent/bodytext markers
    # (the name varies between skin versions, hence the alternation).
    content = re.search(
        "<!-- (?:bodycontent|bodytext) -->(.*?)<!-- (?:/bodycontent|/bodytext) -->",
        text, re.DOTALL).group(1)
    heading = re.search(
        '<h1 id="firstHeading" class="firstHeading">(.*?)</h1>',
        text, re.DOTALL).group(1)
    # Section redirects (Foo#Bar) embed a redirectToFragment() call; when
    # present, start extraction at the target section's headline.
    try:
        redirect = re.search(
            r'redirectToFragment\("#(.*?)"\);', text,re.DOTALL).group(1)
    except AttributeError:
        redirect = False
    info("int_wiki", "redir", redirect)
    if redirect:
        targetBegin = re.search(
            r'<h\d>(<span .*?</span> )?<span class="mw-headline" id="'
            + redirect + '">', content).start()
        content = content[targetBegin:]
    # If the article belongs to the local disambiguation category (lang[2]),
    # return the linked article titles instead of body text.
    if re.search('<div id="mw-normal-catlinks">.*?<a[^>]*>%s</a>'%lang[2],text):
        content = htmlDecodeEntites(content)
        links = re.findall('<li>.*?<a href="/wiki/(.*?)"[^>]*>.*?</a>.*?</li>',content)
        # URL-decode and turn underscores back into spaces.
        links = [urllib.parse.unquote(x.replace("_", " ")) for x in links]
        return 2,links,heading
    else:
        # Normal article: parse the body as XML and flatten it to text.
        content = escapearguments(content)
        etr = etree.fromstring(content)
        definition = ""
        def parseP(ele):
            # Flatten one element (text plus children) to a single-line string.
            if ele.text:
                eletext = ele.text
            else:
                eletext = ""
            eletext += etreeToText(ele)
            eletext = eletext.replace(
                "\n"," ").replace(
                "\r"," ").replace(
                "\t"," ").strip()
            return eletext
        # Walk the top-level body elements, rendering each kind with a
        # lightweight wiki-ish markup (*, #, ==headline==, ...).
        for element in etr.getchildren():
            if element.tag == "p":
                definition += parseP(element) + " "
            elif element.tag == "dl":
                definition += parseP(element) + " "
            elif element.tag == "ul":
                for subelement in element.findall("li"):
                    definition += "* " + parseP(subelement) + " "
                definition += "/* "
            elif element.tag == "ol":
                for subelement in element.findall("li"):
                    definition += "# " + parseP(subelement) + " "
                definition += "/# "
            elif re.match("h[2345]",element.tag):
                # Only the span carrying class "mw-headline" holds the title;
                # mark it with ='s matching the heading level.
                for subelement in element.findall("span"):
                    level = int(element.tag[1])
                    if subelement.get("class") == "mw-headline":
                        definition += "="*level + parseP(subelement) + "="*level + " "
            elif element.tag == "blockquote":
                definition += parseP(element) + " "
            else:
                pass
        # Resolve named HTML entities; unknown names are kept verbatim.
        definition = re.sub(r"&(\w+);",lambda match: html.entities.entitydefs.get(match.group(1),"&" + match.group(1) + ";"),definition)
        def repl42(match):
            # Drop empty brackets "[ ]"/"( )" left over after markup removal,
            # collapsing surrounding whitespace to one space when it exists
            # on both sides.
            if match.group(1) and match.group(3):
                return " "
            else:
                return ""
        definition = re.sub(r"(\s*)(\[\s*\]|\(\s*\))(\s*)",repl42,definition)
        # TODO: replace &#xyz; entities
        # The rendered body usually repeats the heading first; strip it.
        if definition.startswith(heading):
            definition = definition[len(heading)+1:]
        return 1,definition,heading