def wikipedia(phenny, input, origterm, lang, to_user=None):
    """Look up *origterm* on the *lang* Wikipedia and say the result.

    An optional "#Section" suffix on the term selects a subsection.
    The reply is prefixed with *to_user* when given.
    """
    origterm = origterm.strip()
    lang = lang.strip()

    if not origterm:
        return phenny.say('Perhaps you meant ".wik Zen"?')

    # "Article#Section" requests a specific subsection of the page.
    section = None
    if "#" in origterm:
        origterm, section = origterm.split("#")[:2]
        section = format_subsection(section)
    term = format_term(origterm)

    # wikiapi/wikiuri/wikisearch are %-templates taking the language code
    # (established by the "% lang" applications here).
    w = wiki.Wiki(wikiapi % lang, wikiuri % lang, wikisearch % lang)

    try:
        result = w.search(term)
    except web.ConnectionError:
        # Fix: the old message hardcoded "en.wikipedia.org" regardless of
        # the requested language, and formatted the raw template without
        # "% lang", leaving a literal "%s" in the displayed URL.
        # (Assumes wikiuri is a %-template with a {0} slot for the term,
        # consistent with its use above — TODO confirm at definition site.)
        error = "Can't connect to {0}.wikipedia.org ({1})".format(
            lang, (wikiuri % lang).format(term))
        return phenny.say(error)

    if result is not None:
        # Disregarding [0], the snippet; the URL is the last "|"-field.
        url = result.split("|")[-1]
        check_posted(phenny, input, url)
        if to_user:
            phenny.say(to_user + ', ' + parse_wiki_page(url, term, section))
        else:
            phenny.say(parse_wiki_page(url, term, section))
    else:
        phenny.say(
            'Can\'t find anything in Wikipedia for "{0}".'.format(origterm))
def apertium_wiki(phenny, input, origterm, to_nick=None):
    """Look up *origterm* on the Apertium wiki and say the first sentence.

    Tries a direct page fetch first; on failure falls back to a title
    search and then a full-text search via the wiki API. The reply is
    truncated to fit IRC limits and prefixed with *to_nick* when given.
    """
    term = format_term(origterm)
    try:
        html = str(web.get(wikiuri.format(term)))
    except Exception:
        # Fix: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt. Direct fetch failed — fall back to the
        # search API, first by title and then by text.
        apiResponse = json.loads(
            str(web.get(wikisearchuri.format(term, 'title'))))
        if len(apiResponse['query']['search']):
            term = apiResponse['query']['search'][0]['title']
            html = str(web.get(wikiuri.format(term)))
        else:
            apiResponse = json.loads(
                str(web.get(wikisearchuri.format(term, 'text'))))
            if len(apiResponse['query']['search']):
                term = apiResponse['query']['search'][0]['title']
                html = str(web.get(wikiuri.format(term)))
            else:
                phenny.reply("No wiki results for that term.")
                return

    page = lxml.html.fromstring(html)

    if "#" in origterm:
        # A "#Section" suffix selects the paragraph after that heading.
        section = format_subsection(origterm.split("#")[1])
        text = page.find(".//span[@id='%s']" % section)
        if text is None:
            phenny.reply("That subsection does not exist.")
            return
        text = text.getparent().getnext()
    else:
        paragraphs = page.findall('.//p')
        if len(paragraphs) > 2:
            # Fix: reuse the already-computed list instead of running the
            # same findall('.//p') query a second time.
            text = paragraphs[1]
        else:
            text = page.findall(".//*[@id='mw-content-text']")[0]

    sentences = text.text_content().split(". ")
    sentence = '"' + sentences[0] + '"'

    # Leave room for the " - <url>" suffix within a 430-byte message.
    maxlength = 430 - len(
        (' - ' + wikiuri.format(format_term_display(term))).encode('utf-8'))
    if len(sentence.encode('utf-8')) > maxlength:
        # Truncate on a UTF-8 boundary, then drop the (likely partial)
        # last word and mark the elision.
        sentence = sentence.encode('utf-8')[:maxlength].decode(
            'utf-8', 'ignore')
        words = sentence[:-5].split(' ')
        words.pop()
        sentence = ' '.join(words) + ' [...]'

    if hasattr(input, 'sender'):
        check_posted(phenny, input, wikiuri.format(format_term_display(term)))

    if to_nick:
        phenny.say(to_nick + ', ' + sentence + ' - ' +
                   wikiuri.format(format_term_display(term)))
    else:
        phenny.say(sentence + ' - ' + wikiuri.format(format_term_display(term)))
def gettitle(phenny, input, uri):
    """Fetch *uri* and return its HTML <title> as "[ title ]", or None.

    Known wiki URLs (Apertium wiki, Wiktionary, Wikipedia) are delegated
    to their dedicated handlers; localhost and blacklisted URLs are
    silently refused. Follows up to 25 redirects and only reads
    HTML/XHTML responses.
    """
    if not ':' in uri:
        uri = 'http://' + uri
    # Rewrite hash-bang fragments into the crawlable-AJAX query form.
    uri = uri.replace('#!', '?_escaped_fragment_=')

    # Delegate known wiki-style URLs to their specific handlers.
    if uri.startswith('http://wiki.apertium.org/wiki/'):
        item = uri[len('http://wiki.apertium.org/wiki/'):]
        return awik(phenny, re.match(r'(blahblah)?(.*)', item))
    if re.match(r'https?://en.wiktionary.org/wiki/(.*)', uri):
        item = re.match(r'https?://en.wiktionary.org/wiki/(.*)', uri).group(1)
        return w(phenny, re.match(r'(blahblah)?(.*)', web.unquote(item)))
    if re.match(r'https?://([a-z]{2,3}).wikipedia.org/wiki/(.*)', uri):
        match = re.match(r'https?://([a-z]{2,3}).wikipedia.org/wiki/(.*)', uri)
        lang, page = match.group(1), match.group(2)
        return wikipedia(phenny, page, lang)

    # Percent-quote everything after the first dot (host tail + path).
    parts = uri.split(".")
    start = parts[0]
    parts.pop(0)
    uri = start + "." + web.quote('.'.join(parts))

    title = None
    localhost = [
        'http://localhost/', 'http://localhost:80/',
        'http://localhost:8080/', 'http://127.0.0.1/',
        'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
        'https://localhost/', 'https://localhost:80/',
        'https://localhost:8080/', 'https://127.0.0.1/',
        'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
        'http://localhost:', 'https://localhost:',
    ]
    for s in localhost:
        if uri.startswith(s):
            return  # phenny.reply('Sorry, access forbidden.')

    # Compile configured blacklist patterns onto the bot, then refuse
    # any URI matching one of them.
    if not hasattr(phenny.config, 'blacklisted_urls'):
        phenny.config.blacklisted_urls = []
    if not hasattr(phenny.bot, 'blacklisted_urls'):
        phenny.bot.blacklisted_urls = []
    for s in phenny.config.blacklisted_urls:
        phenny.bot.blacklisted_urls.append(re.compile(s))
    for regex in phenny.bot.blacklisted_urls:
        if regex.match(uri):
            return

    try:
        redirects = 0
        while True:
            try:
                info = web.head(uri)
                # web.head returns either headers alone or
                # [headers, status] (established by the isinstance check).
                if not isinstance(info, list):
                    status = '200'
                else:
                    status = str(info[1])
                    info = info[0]
            except web.HTTPError:
                # HEAD failed; retry with a full GET via requests.
                # NOTE(review): requests raises its own exception types,
                # so this inner "except web.HTTPError" probably never
                # fires for requests failures — confirm web.HTTPError's
                # definition before relying on it.
                try:
                    info = requests.get(uri, headers=web.default_headers,
                                        verify=True)
                    status = str(info.status_code)
                    info = info.headers
                except web.HTTPError:
                    return None

            if status.startswith('3'):
                # Follow the redirect, giving up after 25 hops.
                uri = urllib.parse.urljoin(uri, info['Location'])
            else:
                break

            redirects += 1
            if redirects >= 25:
                return None

        try:
            mtype = info['content-type']
        except Exception:  # fix: was a bare "except:"
            return None
        if not mtype or not (('/html' in mtype) or ('/xhtml' in mtype)):
            return None

        try:
            # Renamed from "bytes", which shadowed the builtin.
            content = web.get(uri)
        except Exception:  # fix: was a bare "except:"
            return None
        #bytes = u.read(262144)
        #u.close()
    except Exception:  # fix: was a bare "except:"
        return

    m = r_title.search(content)
    if m:
        title = m.group(1)
        title = title.strip()
        title = title.replace('\t', ' ')
        title = title.replace('\r', ' ')
        title = title.replace('\n', ' ')
        # Fix: collapse runs of spaces. The original replaced a single
        # space with a single space — a no-op inside "while ' ' in
        # title", i.e. an infinite loop for any title containing a space.
        while '  ' in title:
            title = title.replace('  ', ' ')
        if len(title) > 200:
            title = title[:200] + '[...]'

        def e(m):
            # Decode one HTML entity: &#xHH; (hex), &#DD; (decimal),
            # or a named entity via name2codepoint.
            entity = m.group(0)
            if entity.startswith('&#x'):
                cp = int(entity[3:-1], 16)
                return chr(cp)
            elif entity.startswith('&#'):
                cp = int(entity[2:-1])
                return chr(cp)
            else:
                char = name2codepoint[entity[1:-1]]
                return chr(char)

        title = r_entity.sub(e, title)

    if title:
        title = title.replace('\n', '')
        title = title.replace('\r', '')
        title = "[ {0} ]".format(title)
        # Append "(posted: ...)" when the posted module knows this URL.
        if "posted" in phenny.variables:
            from modules.posted import check_posted
            posted = check_posted(phenny, input, uri)
            if posted:
                title = "{0} (posted: {1})".format(title, posted)
    else:
        title = None
    return title
def gettitle(phenny, input, uri):
    """Fetch *uri* and return its HTML <title> as "[ title ]", or None.

    Refuses localhost URLs, follows up to 25 redirects, and only reads
    HTML/XHTML responses.

    NOTE(review): a second gettitle with the same signature appears
    earlier — if both live in one module, this definition shadows it;
    confirm whether the duplication is intentional.
    """
    if not ':' in uri:
        uri = 'http://' + uri
    # Rewrite hash-bang fragments into the crawlable-AJAX query form.
    uri = uri.replace('#!', '?_escaped_fragment_=')

    title = None
    localhost = [
        'http://localhost/', 'http://localhost:80/',
        'http://localhost:8080/', 'http://127.0.0.1/',
        'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
        'https://localhost/', 'https://localhost:80/',
        'https://localhost:8080/', 'https://127.0.0.1/',
        'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
    ]
    for s in localhost:
        if uri.startswith(s):
            return phenny.reply('Sorry, access forbidden.')

    try:
        redirects = 0
        while True:
            info = web.head(uri)
            # web.head returns either headers alone or [headers, status]
            # (established by the isinstance check).
            if not isinstance(info, list):
                status = '200'
            else:
                status = str(info[1])
                info = info[0]

            if status.startswith('3'):
                # Follow the redirect, giving up after 25 hops.
                uri = urllib.parse.urljoin(uri, info['Location'])
            else:
                break

            redirects += 1
            if redirects >= 25:
                return None

        try:
            mtype = info['content-type']
        except Exception:  # fix: was a bare "except:"
            return None
        if not (('/html' in mtype) or ('/xhtml' in mtype)):
            return None

        # Renamed from "bytes", which shadowed the builtin.
        content = web.get(uri)
        #bytes = u.read(262144)
        #u.close()
    except Exception:  # fix: was a bare "except:"
        return

    m = r_title.search(content)
    if m:
        title = m.group(1)
        title = title.strip()
        title = title.replace('\t', ' ')
        title = title.replace('\r', ' ')
        title = title.replace('\n', ' ')
        # Fix: collapse runs of spaces. The original replaced a single
        # space with a single space — a no-op inside "while ' ' in
        # title", i.e. an infinite loop for any title containing a space.
        while '  ' in title:
            title = title.replace('  ', ' ')
        if len(title) > 200:
            title = title[:200] + '[...]'

        def e(m):
            # Decode one HTML entity: &#xHH; (hex), &#DD; (decimal),
            # or a named entity via name2codepoint.
            entity = m.group(0)
            if entity.startswith('&#x'):
                cp = int(entity[3:-1], 16)
                return chr(cp)
            elif entity.startswith('&#'):
                cp = int(entity[2:-1])
                return chr(cp)
            else:
                char = name2codepoint[entity[1:-1]]
                return chr(char)

        title = r_entity.sub(e, title)

    if title:
        title = title.replace('\n', '')
        title = title.replace('\r', '')
        title = "[ {0} ]".format(title)
        # Append "(posted: ...)" when the posted module knows this URL.
        if "posted" in phenny.variables:
            from modules.posted import check_posted
            posted = check_posted(phenny, input, uri)
            if posted:
                title = "{0} (posted: {1})".format(title, posted)
    else:
        title = None
    return title