def get_results(text):
    if not text:
        return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                # if it can't access the site, fail silently
                page_title = None
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            display.append([page_title, url, bitly])
        i += 1
    return display
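# Hedged usage sketch for the version above: each element of the returned
# list is [page_title, url, bitly]; when bit.ly support is not loaded, the
# third field falls back to the URL itself. The module globals (url_finder,
# EXCLUSION_CHAR, bitly_loaded, getTLD, ...) are assumed to be configured
# elsewhere in this module.
#
# for page_title, url, bitly in get_results('see http://example.com/'):
#     print('%s | %s | %s' % (page_title, url, bitly))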
def tidy_title(title):
    title = title.strip()
    new_title = str()
    for char in title:
        unichar = uc.encode(char)
        if len(list(unichar)) <= 3:
            new_title += unichar
    return new_title
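# A minimal sketch of the byte-length filter above, assuming uc.encode
# behaves like a plain UTF-8 encode (the uc helper module is defined
# elsewhere): any character whose UTF-8 form is longer than 3 bytes
# (astral-plane symbols, most emoji) is dropped from the title.

def _demo_tidy_title(title):
    new_title = str()
    for char in title.strip():
        encoded = char.encode('utf-8')  # assumed stand-in for uc.encode
        if len(encoded) <= 3:
            new_title += encoded
    return new_title

# _demo_tidy_title(u'news \U0001F600 flash')  -> 'news  flash'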
def e(m):
    entity = m.group()
    if entity.startswith('&#x'):
        cp = int(entity[3:-1], 16)
        meep = unichr(cp)
    elif entity.startswith('&#'):
        cp = int(entity[2:-1])
        meep = unichr(cp)
    else:
        char = name2codepoint[entity[1:-1]]
        meep = unichr(char)
    try:
        return uc.decode(meep)
    except:
        return uc.decode(uc.encode(meep))
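# Hedged usage sketch for the entity decoder above. r_entity is defined
# elsewhere in this module; the pattern below is an assumption covering the
# three branches handled ('&#xHH;' hex, '&#NN;' decimal, and named
# entities). This is Python 2 code (unichr, htmlentitydefs.name2codepoint).

_demo_r_entity = re.compile(r'&([^;\s]+);')  # assumed shape of r_entity

# _demo_r_entity.sub(e, 'Tom &amp; Jerry &#8212; &#x263a;')
# -> u'Tom & Jerry \u2014 \u263a' (modulo uc.decode behaviour)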
def get_results(text):
    if not text:
        return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        domain = getTLD(url)
        domain = domain.strip()
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except:
            i += 1
            continue
        localhost = False
        for x in ips:
            y = x[4][0]
            if y.startswith('127') or '::1' == y or '0:0:0:0:0:0:0:1' == y:
                localhost = True
                break
        if localhost:
            break
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                # if it can't access the site, fail silently
                page_title = None
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            if page_title:
                if IPv4_HOST in page_title or IPv6_HOST in page_title:
                    break
            display.append([page_title, url, bitly])
        i += 1
    return display
def get_results(text):
    if not text:
        return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except:
            i += 1
            continue
        localhost = False
        for x in ips:
            y = x[4][0]
            if y.startswith('127') or '::1' == y or '0:0:0:0:0:0:0:1' == y:
                localhost = True
                break
        if localhost:
            break
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                # if it can't access the site, fail silently
                page_title = None
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            if page_title:
                if IPv4_HOST in page_title or IPv6_HOST in page_title:
                    break
            display.append([page_title, url, bitly])
        i += 1
    return display
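# The localhost guard in the two versions above can be exercised on its own.
# A minimal runnable sketch (the port and SOL_TCP arguments mirror the
# getaddrinfo calls above):
import socket

def _demo_is_loopback(domain):
    # Resolve the domain and flag loopback answers, as get_results does.
    try:
        ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
    except socket.error:
        return False
    for info in ips:
        addr = info[4][0]
        if addr.startswith('127') or addr in ('::1', '0:0:0:0:0:0:0:1'):
            return True
    return False

# _demo_is_loopback('localhost')   -> True on typical systems
# _demo_is_loopback('example.com') -> False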
def get_results(text):
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = []
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                # if it can't access the site, fail silently
                page_title = None
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            display.append([page_title, url, bitly])
        i += 1
    return display
def get_results(text):
    if not text:
        return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    passs = False
    while i < k:
        url = uc.encode(a[i][0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if '//' in domain:
            domain = domain.split('//')[1]
        if not url.startswith(EXCLUSION_CHAR):
            #passs, page_title = find_title(url)
            passs, page_title = find_title_lite(url)
            display.append([page_title, url])
        i += 1
    return passs, display
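# This variant returns a (success, results) pair rather than a bare list,
# and each result is [page_title, url] with no bit.ly field. A hedged usage
# sketch (find_title_lite and the module globals are assumed to be defined
# as elsewhere in this file):
#
# ok, results = get_results('check out http://example.com/')
# if ok:
#     for page_title, url in results:
#         print('%s - %s' % (page_title, url))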
    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if '  ' in x:
            x = x.replace('  ', ' ')
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)
    new_title = str()
    for char in title:
        unichar = uc.encode(char)
        if len(list(unichar)) <= 3:
            new_title += unichar
    title = new_title
    title = re.sub(r'(?i)dcc\ssend', '', title)
    if title:
        return True, title
    else:
        return False, 'No Title'

def remove_nonprint(text):
    new = str()
    for char in text:
        # NOTE: the source fragment ends mid-loop here; the body below is a
        # reconstruction from context (keep printable ASCII, drop the rest).
        x = ord(char)
        if 32 <= x <= 126:
            new += char
    return new
def find_title(url):
    """Finds the title when provided with a URL string."""
    uri = url
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)
    for item in IGNORE:
        if item in uri:
            return
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow redirects, in case someone pastes a bitly of a tinyurl, etc.
    page = str()
    while True:
        req = urllib2.Request(uri, headers={'Accept': 'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0')
        u = urllib2.urlopen(req)
        info = u.info()
        page = u.read()
        u.close()
        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            uri = urlparse.urljoin(uri, info['Location'])
        else:
            break
        redirects += 1
        if redirects >= 50:
            return "Too many redirects."

    try:
        mtype = info['content-type']
    except:
        return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    if not page:
        u = urllib2.urlopen(req)
        page = u.read(262144)
        u.close()
    content = page

    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('', content)
    start = content.find('<title>')
    if start == -1:
        return
    end = content.find('</title>', start)
    if end == -1:
        return
    content = content[start + 7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content
    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1], 16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)
    if title:
        title = unicode.decode(title)
    else:
        title = 'None'
    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)
    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)
    if title:
        return title
    else:
        return 'No title'
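# The redirect loop above joins a possibly-relative Location header onto the
# current URI. A minimal runnable sketch of that step (Python 2 urlparse,
# as used in this module):
import urlparse

assert urlparse.urljoin('http://example.com/a/b', '/c') == 'http://example.com/c'
assert urlparse.urljoin('http://example.com/a/b', 'http://other.example/x') == 'http://other.example/x'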
def find_title(url):
    """Finds the title when provided with a URL string."""
    uri = url
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)
    for item in IGNORE:
        if item in uri:
            return
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow redirects, in case someone pastes a bitly of a tinyurl, etc.
    while True:
        req = urllib2.Request(uri, headers={'Accept': 'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0')
        u = urllib2.urlopen(req)
        info = u.info()
        u.close()
        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            uri = urlparse.urljoin(uri, info['Location'])
        else:
            break
        redirects += 1
        if redirects >= 50:
            return "Too many redirects."

    try:
        mtype = info['content-type']
    except:
        return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    u = urllib2.urlopen(req)
    bytes = u.read(262144)
    u.close()
    content = bytes

    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('', content)
    start = content.find('<title>')
    if start == -1:
        return
    end = content.find('</title>', start)
    if end == -1:
        return
    content = content[start + 7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content
    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1], 16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)
    if title:
        title = unicode.decode(title)
    else:
        title = 'None'
    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)
    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)
    if title:
        return title
    else:
        return 'No title'
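# The title-scraping core of find_title can be exercised in isolation. A
# minimal runnable sketch of the same normalize-then-slice approach,
# assuming plain byte-string HTML as input:
import re

def _demo_extract_title(content):
    # Normalize '<TITLE lang=...>' style tags to bare '<title>' first,
    # exactly as find_title does, then slice between the tags.
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    start = content.find('<title>')
    if start == -1:
        return None
    end = content.find('</title>', start)
    if end == -1:
        return None
    return content[start + 7:end].strip()

# _demo_extract_title('<html><TITLE Lang="en"> Example </TITLE></html>')
# -> 'Example'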