def text(html):
    """html to text dumb converter cargo-culted from etymology.py"""
    # Strip markup, collapse whitespace runs, then decode HTML entities.
    stripped = r_tag.sub("", html)
    collapsed = r_whitespace.sub(" ", stripped)
    return web.decode(collapsed.strip())
def tr(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    in_lang, out_lang, phrase = trigger.groups()

    # Guard clauses: phrase length cap (admins exempt) and empty input.
    if len(phrase) > 350 and not trigger.admin:
        return bot.reply('Phrase must be under 350 characters.')
    if phrase.strip() == '':
        return bot.reply('You need to specify a string for me to translate!')

    in_lang = in_lang or 'auto'
    out_lang = out_lang or 'en'
    if in_lang == out_lang:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    msg, in_lang = translate(phrase, in_lang, out_lang)
    # Python 2 compatibility: promote byte strings to unicode.
    if sys.version_info.major < 3 and isinstance(msg, str):
        msg = msg.decode('utf-8')
    if msg:
        msg = web.decode(msg)
        msg = '"%s" (%s to %s, translate.google.com)' % (msg, in_lang, out_lang)
    else:
        msg = 'The %s to %s translation failed, are you sure you specified valid language abbreviations?' % (in_lang, out_lang)
    bot.reply(msg)
def find_title(url, verify=True):
    """Return the title for the given URL."""
    response = requests.get(url, stream=True, verify=verify,
                            headers=default_headers)
    try:
        content = b''
        for chunk in response.iter_content(chunk_size=512):
            content += chunk
            # Stop as soon as the closing tag shows up, or at the size cap.
            if b'</title>' in content or len(content) > max_bytes:
                break
        content = content.decode('utf-8', errors='ignore')
    finally:
        # need to close the connexion because we have not read all the data
        response.close()
    # Legacy cleanup inherited from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    title = re_dcc.sub('', title)
    return title or None
def find_title(url, verify=True):
    """Return the title for the given URL."""
    response = requests.get(url, stream=True, verify=verify,
                            headers=default_headers)
    raw = b""
    try:
        for piece in response.iter_content(chunk_size=512):
            raw += piece
            if b"</title>" in raw or len(raw) > max_bytes:
                break  # closing tag seen, or size budget exhausted
        page = raw.decode("utf-8", errors="ignore")
    finally:
        response.close()  # we deliberately did not drain the stream
    # Legacy markup normalisation kept from the original implementation.
    page = title_tag_data.sub(r"<\1title>", page)
    page = quoted_title.sub("", page)
    start = page.find("<title>")
    end = page.find("</title>")
    if start == -1 or end == -1:
        return
    title = web.decode(page[start + 7:end]).strip()[:200]
    title = " ".join(title.split())  # cleanly remove multiple spaces
    return re_dcc.sub("", title) or None
def find_title(url=None, content=None):
    """Return the title for the given URL.

    Copy of find_title that allows for avoiding duplicate requests.
    """
    # Exactly one of url/content must be supplied.
    if bool(url) == bool(content):
        raise ValueError("url *or* content needs to be provided to find_title")
    if url:
        try:
            content, headers = web.get(url, return_headers=True,
                                       limit_bytes=max_bytes)
        except UnicodeDecodeError:
            return  # Fail silently when data can't be decoded
    assert content
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r"<\1title>", content)
    content = quoted_title.sub("", content)
    start = content.find("<title>")
    end = content.find("</title>")
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end]).strip()[:200]
    title = " ".join(title.split())  # cleanly remove multiple spaces
    return re_dcc.sub("", title) or None
def find_title(url=None, content=None):
    """Return the title for the given URL.

    Copy of find_title that allows for avoiding duplicate requests.
    """
    if (not content and not url) or (content and url):
        # Caller must supply one source, and only one.
        raise ValueError('url *or* content needs to be provided to find_title')
    if url:
        try:
            content, headers = web.get(url, return_headers=True,
                                       limit_bytes=max_bytes)
        except UnicodeDecodeError:
            return  # Fail silently when data can't be decoded
    assert content
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    return re_dcc.sub('', title) or None
def find_title(url):
    """Return the title for the given URL."""
    try:
        content, headers = web.get(url, return_headers=True,
                                   limit_bytes=max_bytes)
    except UnicodeDecodeError:
        return  # Fail silently when data can't be decoded
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    return re_dcc.sub('', title) or None
def find_title(url):
    """Return the title for the given URL."""
    response = requests.get(url, stream=True)
    page = ''
    try:
        for fragment in response.iter_content(chunk_size=512,
                                              decode_unicode=True):
            page += str(fragment)
            if '</title>' in page or len(page) > max_bytes:
                break
    except UnicodeDecodeError:
        return  # Fail silently when data can't be decoded
    finally:
        # need to close the connexion because we have not read all the data
        response.close()
    content = page
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    return re_dcc.sub('', title) or None
def text(html):
    '''html to text dumb converter cargo-culted from etymology.py'''
    # Tag removal, whitespace collapse, entity decode — in that order.
    return web.decode(r_whitespace.sub(' ', r_tag.sub('', html)).strip())
def tr(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    in_lang, out_lang, phrase = trigger.groups()
    if (len(phrase) > 350) and (not trigger.admin):
        return bot.reply('Phrase must be under 350 characters.')
    if phrase.strip() == '':
        return bot.reply('You need to specify a string for me to translate!')
    # Fall back to auto-detect source and English target.
    in_lang = in_lang or 'auto'
    out_lang = out_lang or 'en'
    if in_lang != out_lang:
        msg, in_lang = translate(phrase, in_lang, out_lang)
        # Python 2 compatibility: promote byte strings to unicode.
        if sys.version_info.major < 3 and isinstance(msg, str):
            msg = msg.decode('utf-8')
        if not msg:
            msg = 'The %s to %s translation failed, are you sure you specified valid language abbreviations?' % (in_lang, out_lang)
        else:
            decoded = web.decode(msg)
            msg = '"%s" (%s to %s, translate.google.com)' % (decoded, in_lang, out_lang)
        bot.reply(msg)
    else:
        bot.reply('Language guessing failed, so try suggesting one!')
def fetch_character(query):
    """Look up *query* via the character API and return a summary string.

    All failure modes are reported as human-readable strings rather than
    exceptions, so callers can relay them to the user directly.
    """
    if not query:
        return "No search query provided."
    try:
        character = requests.get(api + cFilter % query, timeout=(10.0, 4.0))
    except requests.exceptions.ConnectTimeout:
        return "Connection timed out."
    except requests.exceptions.ConnectionError:
        return "Could not connect to server."
    except requests.exceptions.ReadTimeout:
        return "Server took too long to reply."
    try:
        character.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # BUG FIX: HTTPError has no ``.message`` attribute on Python 3
        # (it was removed after Python 2.5); str(e) works everywhere.
        return "HTTP error: " + str(e)
    try:
        data = character.json()
    except ValueError:
        # Body was not JSON; surface it raw for debugging.
        return character.content
    try:
        entry = data['data'][0]
        name = entry['attributes'].get('name')
        # BUG FIX: .get('description') may return None, which would crash
        # on .replace(); treat a missing description as empty text.
        raw_desc = entry['attributes'].get('description') or ''
        description = web.decode(
            bleach.clean(raw_desc.replace('<br/>', ' ').replace('<br>', ' '),
                         strip=True))
    except IndexError:
        return "No results found."
    return "{name} - Description: {description}".format(
        name=name, description=description)
def find_title(url, verify=True):
    """Return the title for the given URL."""
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=default_headers)
        buf = b''
        for chunk in response.iter_content(chunk_size=512):
            buf += chunk
            if b'</title>' in buf or len(buf) > max_bytes:
                break
        content = buf.decode('utf-8', errors='ignore')
        # Need to close the connection because we have not read all
        # the data
        response.close()
    except requests.exceptions.ConnectionError:
        return None
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    # NOTE: this variant searches from the end (rfind), unlike its siblings.
    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    return re_dcc.sub('', title) or None
def duck_search(query):
    """Return the first decoded DuckDuckGo result URL for *query*."""
    # Bangs ("!") would redirect the search; strip them before quoting.
    uri = 'http://duckduckgo.com/html/?q=%s&kl=uk-en' % web.quote(
        query.replace('!', ''))
    page = web.get(uri)
    hit = r_duck.search(page)
    if hit:
        return web.decode(hit.group(1))
def text(html):
    """Strip tags from *html* and normalise the remaining text."""
    cleaned = r_tag.sub('', html).strip()
    # Flatten newlines and abbreviate grammatical labels to save space.
    for old, new in (('\n', ' '), ('\r', ''),
                     ('(intransitive', '(intr.'),
                     ('(transitive', '(trans.')):
        cleaned = cleaned.replace(old, new)
    return web.decode(cleaned)
def text(html):
    """Strip tags and reference superscripts from *html* and normalise it."""
    # Remove superscripts that are references from definition first,
    # so they don't leak into the plain text.
    cleaned = r_sup.sub('', html)
    cleaned = r_tag.sub('', cleaned).strip()
    # Flatten newlines and abbreviate grammatical labels to save space.
    for old, new in (('\n', ' '), ('\r', ''),
                     ('(intransitive', '(intr.'),
                     ('(transitive', '(trans.')):
        cleaned = cleaned.replace(old, new)
    return web.decode(cleaned)
def duck_search(query):
    """Return the first decoded DuckDuckGo result URL for *query*."""
    query = query.replace('!', '')  # bangs would redirect the search
    uri = 'http://duckduckgo.com/html/?q=%s&kl=uk-en' % query
    page = web.get(uri)
    # Everything before the first result block is ads; drop it.
    if 'web-result"' in page:
        page = page.split('web-result"')[1]
    hit = r_duck.search(page)
    if hit:
        return web.decode(hit.group(1))
def duck_search(query):
    """Return the first decoded, unquoted DuckDuckGo result URL for *query*."""
    uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en' % query.replace('!', '')
    # Send a desktop browser UA so we get the full HTML results page.
    page = web.get(uri, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'})
    # Everything before the first result block is ads; drop it.
    if 'web-result' in page:
        page = page.split('web-result')[1]
    hit = r_duck.search(page)
    if hit:
        return web.decode(unquote(hit.group(1)))
def duck_search(query):
    """Return a list of decoded DuckDuckGo result URLs for *query*."""
    uri = "http://duckduckgo.com/html/?q=%s&kl=uk-en" % query.replace("!", "")
    page = web.get(uri)
    # Unlike older variants, this one returns every match, not just the first.
    return [web.decode(hit) for hit in r_duck.findall(page)]
def duck_search(query):
    """Return a list of decoded DuckDuckGo result URLs for *query*."""
    query = query.replace('!', '')  # bangs would redirect the search
    uri = 'http://duckduckgo.com/html/?q=%s&kl=uk-en' % query
    page = web.get(uri)
    urls = [web.decode(match) for match in r_duck.findall(page)]
    return urls
def gettld(bot, trigger):
    """Show information about the given Top Level Domain.

    Scrapes the TLD table page at ``uri`` with a cascade of regexes and
    replies with country/description/IDN/DNSSEC details.
    """
    page = requests.get(uri).text
    tld = trigger.group(2)
    if not tld:
        bot.reply("You must provide a top-level domain to search.")
        return  # Stop if no tld argument is provided
    if tld[0] == '.':
        tld = tld[1:]

    # First table layout: optional link column, then description/notes.
    search = r'(?i)<td><a href="\S+" title="\S+">\.{0}</a></td>\n(<td><a href=".*</a></td>\n)?<td>([A-Za-z0-9].*?)</td>\n<td>(.*)</td>\n<td[^>]*>(.*?)</td>\n<td[^>]*>(.*?)</td>\n'
    matches = re.compile(search.format(tld)).findall(page)
    if not matches:
        # Second layout: title captured, explicit link text column.
        search = r'(?i)<td><a href="\S+" title="(\S+)">\.{0}</a></td>\n<td><a href=".*">(.*)</a></td>\n<td>([A-Za-z0-9].*?)</td>\n<td[^>]*>(.*?)</td>\n<td[^>]*>(.*?)</td>\n'
        matches = re.compile(search.format(tld)).findall(page)
    if matches:
        # Strip residual markup from every captured field.
        fields = [r_tag.sub("", field) for field in matches[0]]
        desc = fields[2]
        if len(desc) > 400:
            desc = desc[:400] + "..."
        reply = "%s -- %s. IDN: %s, DNSSEC: %s" % (
            fields[1], desc, fields[3], fields[4]
        )
    else:
        # Country-code layout with a flag icon column.
        # BUG FIX: ``unicode`` only exists on Python 2; ``tld`` is already
        # a text string, so use it directly.
        search = r'<td><a href="\S+" title="\S+">.{0}</a></td>\n<td><span class="flagicon"><img.*?\">(.*?)</a></td>\n<td[^>]*>(.*?)</td>\n<td[^>]*>(.*?)</td>\n<td[^>]*>(.*?)</td>\n<td[^>]*>(.*?)</td>\n<td[^>]*>(.*?)</td>\n'
        matches = re.compile(search.format(tld)).findall(page)
        if matches:
            keys = ("country", "expl", "notes", "idn", "dnssec", "sld")
            info = dict(zip(keys, matches[0]))
            for key in info:
                if info[key] == " ":
                    info[key] = "N/A"
                info[key] = r_tag.sub('', info[key])
            if len(info["notes"]) > 400:
                info["notes"] = info["notes"][:400] + "..."
            # BUG FIX: the reply template previously contained a literal
            # newline, which is not legal inside a single IRC message.
            reply = "%s (%s, %s). IDN: %s, DNSSEC: %s, SLD: %s" % (
                info["country"], info["expl"], info["notes"],
                info["idn"], info["dnssec"], info["sld"])
        else:
            reply = "No matches found for TLD: {0}".format(tld)
    # Final touches + output
    reply = web.decode(reply)
    bot.reply(reply)
def find_title(url):
    """Return the title for the given URL.

    Prefers the OpenGraph title (og:title) when present, comparing it with
    the <title> element and returning the longer of the two; otherwise
    falls back to streaming the page and extracting <title> manually.
    """
    response = requests.get(url, headers={'User-Agent': 'Sopel IRC Syrup'},
                            stream=True, verify=True)
    bs = BeautifulSoup(response.content, "html.parser")
    og_meta = bs.find("meta", property="og:title")
    if og_meta:
        ogtitle = bs.find("meta", property="og:title")['content']
        # Pick whichever of <title> / og:title carries more information.
        if bs.title:
            ttitle = bs.title.text
            if len(ttitle) > len(ogtitle):
                return ttitle.strip()
            else:
                return ogtitle.strip()
    try:
        page = ''
        for fragment in response.iter_content(chunk_size=512,
                                              decode_unicode=True):
            if isinstance(fragment, bytes):
                # Undecodable payload: stop accumulating.
                break
            page += fragment
            if '</title>' in page or len(page) > max_bytes:
                break
    except UnicodeDecodeError:
        return  # Fail silently when data can't be decoded
    finally:
        # need to close the connexion because we have not read all the data
        response.close()
    content = page
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    title = re_dcc.sub('', title)
    title = title.replace("[apos]", "'")
    return title or None
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    command = trigger.group(2)
    if not command:
        return bot.reply('You did not give me anything to translate')

    def langcode(p):
        # ":xx" .. ":xxxxxxxx" — colon followed by letters only.
        return p.startswith(':') and (2 < len(p) < 10) and p[1:].isalpha()

    # Peel up to two ":lang" prefixes off the front (source, then target).
    args = ['auto', 'en']
    for i in range(2):
        if ' ' not in command:
            break
        prefix, rest = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = rest
    phrase = command

    if len(phrase) > 350 and not trigger.admin:
        return bot.reply('Phrase must be under 350 characters.')
    if phrase.strip() == '':
        return bot.reply('You need to specify a string for me to translate!')

    src, dest = args
    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return
    msg, src = translate(phrase, src, dest,
                         verify_ssl=bot.config.core.verify_ssl)
    if not src:
        return bot.say(
            "Translation failed, probably because of a rate-limit.")
    # Python 2 compatibility: promote byte strings to unicode.
    if sys.version_info.major < 3 and isinstance(msg, str):
        msg = msg.decode('utf-8')
    if msg:
        msg = web.decode(msg)
        msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)
    else:
        msg = 'The %s to %s translation failed, are you sure you specified valid language abbreviations?' % (src, dest)
    bot.reply(msg)
def duck_search(query):
    """Return the first decoded, unquoted DuckDuckGo result URL for *query*."""
    parameters = {
        'kl': 'us-en',
        'q': query.replace('!', ''),  # bangs would redirect the search
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    page = requests.get('https://duckduckgo.com/html/', parameters,
                        headers=headers).text
    # Everything before the first result block is ads; drop it.
    if 'web-result' in page:
        page = page.split('web-result')[1]
    hit = r_duck.search(page)
    if hit:
        return web.decode(unquote(hit.group(1)))
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    command = trigger.group(2)
    if not command:
        return bot.reply('You did not give me anything to translate')

    def langcode(p):
        # A language hint looks like ":xx" — letters only after the colon.
        return p.startswith(':') and (2 < len(p) < 10) and p[1:].isalpha()

    args = ['auto', 'en']
    # Consume up to two leading ":lang" tokens (source, then target).
    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd
    phrase = command
    if (len(phrase) > 350) and (not trigger.admin):
        return bot.reply('Phrase must be under 350 characters.')
    if phrase.strip() == '':
        return bot.reply('You need to specify a string for me to translate!')
    src, dest = args
    if src != dest:
        msg, src = translate(phrase, src, dest,
                             verify_ssl=bot.config.core.verify_ssl)
        if not src:
            return bot.say("Translation failed, probably because of a rate-limit.")
        # Python 2 compatibility: promote byte strings to unicode.
        if sys.version_info.major < 3 and isinstance(msg, str):
            msg = msg.decode('utf-8')
        if msg:
            msg = web.decode(msg)
            msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)
        else:
            msg = 'The %s to %s translation failed, are you sure you specified valid language abbreviations?' % (src, dest)
        bot.reply(msg)
    else:
        bot.reply('Language guessing failed, so try suggesting one!')
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    command = trigger.group(2)
    if not command:
        return bot.reply('You did not give me anything to translate')

    def langcode(p):
        # ":xx" .. ":xxxxxxxx" — colon followed by letters only.
        return p.startswith(':') and (2 < len(p) < 10) and p[1:].isalpha()

    args = ['auto', 'en']
    # Consume up to two leading ":lang" tokens (source, then target).
    for i in range(2):
        if ' ' not in command:
            break
        prefix, remainder = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = remainder
    phrase = command

    if len(phrase) > 350 and not trigger.admin:
        return bot.reply('Phrase must be under 350 characters.')

    src, dest = args
    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return
    msg, src = translate(phrase, src, dest)
    # Python 2 compatibility: promote byte strings to unicode.
    if sys.version_info.major < 3 and isinstance(msg, str):
        msg = msg.decode('utf-8')
    if msg:
        msg = web.decode(msg)
        msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)
    else:
        msg = 'The %s to %s translation failed, sorry!' % (src, dest)
    bot.reply(msg)
def tr(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    in_lang, out_lang, phrase = trigger.groups()
    if len(phrase) > 350 and not trigger.admin:
        return bot.reply('Phrase must be under 350 characters.')
    # Default to auto-detected source and English target.
    in_lang = in_lang or 'auto'
    out_lang = out_lang or 'en'
    if in_lang == out_lang:
        bot.reply('Language guessing failed, so try suggesting one!')
        return
    msg, in_lang = translate(phrase, in_lang, out_lang)
    # Python 2 compatibility: promote byte strings to unicode.
    if sys.version_info.major < 3 and isinstance(msg, str):
        msg = msg.decode('utf-8')
    if msg:
        msg = web.decode(msg)
        msg = '"%s" (%s to %s)' % (msg, in_lang, out_lang)
    else:
        msg = 'The %s to %s translation failed, sorry!' % (in_lang, out_lang)
    bot.reply(msg)
def find_title(url):
    """Return the title for the given URL.

    Decodes the body using the charset declared in the Content-Type header
    when one is present, otherwise falls back to UTF-8 (failing silently).
    """
    content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes)
    content_type = headers.get('Content-Type') or ''
    # Raw string so \S is a regex class, not a (deprecated) string escape.
    encoding_match = re.match(r'.*?charset *= *(\S+)', content_type)
    # If they gave us something else instead, try that
    if encoding_match:
        try:
            content = content.decode(encoding_match.group(1))
        except (UnicodeError, LookupError):
            # BUG FIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt. Only decode failures (bad data
            # or an unknown charset name) should trigger the UTF-8 fallback.
            encoding_match = None
    # They didn't tell us what they gave us, so go with UTF-8 or fail silently.
    if not encoding_match:
        try:
            content = content.decode('utf-8')
        except UnicodeError:
            # BUG FIX: narrowed from a bare ``except:`` as well.
            return
    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]
    title = ' '.join(title.split())  # cleanly remove multiple spaces
    # More cryptic regex substitutions. This one looks to be myano's invention.
    title = re_dcc.sub('', title)
    return title or None
def find_title(url):
    """Return the title for the given URL."""
    response = requests.get(url, stream=True)
    page = ''
    try:
        for fragment in response.iter_content(chunk_size=512,
                                              decode_unicode=True):
            if isinstance(fragment, bytes):
                # Undecodable payload (no charset): stop accumulating.
                break
            page += fragment
            if '</title>' in page or len(page) > max_bytes:
                break
    except UnicodeDecodeError:
        return  # Fail silently when data can't be decoded
    finally:
        # need to close the connexion because we have not read all the data
        response.close()
    content = page
    # Legacy markup normalisation kept from the original implementation.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = ' '.join(title.strip()[:200].split())  # trim + collapse spaces
    return re_dcc.sub('', title) or None
def find_title(url, verify=True):
    """Return the title for the given URL.

    Special-cases YouTube links (title + uploader via noembed.com) and
    Twitter links (rewritten through the Nitter front-end) before the
    generic streaming title extraction.
    """
    # special case: youtube — ask noembed.com for title and uploader.
    for prefix in YOUTUBE:
        # IDIOM FIX: was ``for i in range(len(YOUTUBE))`` with manual
        # slice comparison; startswith() is the direct equivalent.
        if url.startswith(prefix):
            try:
                response = urllib.request.urlopen(
                    'https://noembed.com/embed?url=' + url)
                response_string = response.read().decode("utf8")
                response.close()
                youtube_info = json.loads(response_string)
                return (youtube_info['title'] + " | lataaja: " +
                        youtube_info['author_name'] + " | YouTube ")
            except Exception:
                # BUG FIX: was a bare ``except:`` which also swallowed
                # SystemExit/KeyboardInterrupt; fall through to the
                # generic extraction below.
                print("not a valid URL")
    # special case: twitter — use the open-source front-end 'Nitter',
    # whose pages carry usable titles.
    for prefix in TWITTER:
        if url != prefix and url.startswith(prefix):
            url = 'https://nitter.net' + url[len(prefix):]
    # end of special cases
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=default_headers)
        content = b''
        for chunk in response.iter_content(chunk_size=512):
            content += chunk
            if b'</title>' in content or len(content) > max_bytes:
                break
        content = content.decode('utf-8', errors='ignore')
        # Need to close the connection because we have not read all the data
        response.close()
    except requests.exceptions.ConnectionError:
        return None
    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)
    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]
    title = ' '.join(title.split())  # cleanly remove multiple spaces
    title = re_dcc.sub('', title)
    # Map the Nitter hostname back to "Twitter" in the displayed title.
    title = "Twitter".join(title.rsplit("nitter", 1))
    return title or None