Exemplo n.º 1
0
Arquivo: url.py Projeto: theduke/jenni
def get_results(text):
    """Find every URL in *text* and collect display info for each.

    Returns a list of ``[page_title, url, bitly]`` triples.  *page_title*
    is None when the page could not be fetched; *bitly* falls back to the
    plain URL when the bit.ly shortener is not loaded.
    """
    if not text:
        return list()
    display = list()
    for found in re.findall(url_finder, text):
        # Normalise the matched URL via the project's unicode helpers.
        url = unicode.encode(found[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        # NOTE(review): `domain` is computed but never used afterwards;
        # kept in case getTLD() has side effects -- confirm and drop.
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        if url.startswith(EXCLUSION_CHAR):
            continue  # explicitly excluded URL
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None  # if it can't access the site fail silently
        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url
        display.append([page_title, url, bitly])
    return display
Exemplo n.º 2
0
def tidy_title(title):
    """Strip *title* and keep only characters whose encoded form is at
    most three bytes long."""
    parts = []
    for character in title.strip():
        encoded = uc.encode(character)
        if len(list(encoded)) <= 3:
            parts.append(encoded)
    return ''.join(parts)
Exemplo n.º 3
0
 def e(m):
     """Decode one HTML-entity regex match into a character string.

     Handles hexadecimal (&#xHH;), decimal (&#DD;) and named (&name;)
     entities.  Falls back to an encode/decode round-trip when the
     direct decode fails.
     """
     entity = m.group()
     if entity.startswith('&#x'):
         # Hexadecimal numeric entity, e.g. '&#x27;'.
         meep = unichr(int(entity[3:-1], 16))
     elif entity.startswith('&#'):
         # Decimal numeric entity, e.g. '&#39;'.
         meep = unichr(int(entity[2:-1]))
     else:
         # Named entity, e.g. '&amp;' -- look up its codepoint.
         meep = unichr(name2codepoint[entity[1:-1]])
     try:
         return uc.decode(meep)
     except Exception:
         # Narrowed from a bare except; round-trip as a last resort.
         return uc.decode(uc.encode(meep))
Exemplo n.º 4
0
def get_results(text):
    """Find every URL in *text* and collect display info for each.

    Skips URLs whose domain does not resolve.  Aborts the whole scan as
    soon as any URL resolves to a loopback address, or a fetched title
    contains the local host name (guard against being tricked into
    fetching/echoing ourselves).  Returns a list of
    ``[page_title, url, bitly]`` triples.
    """
    if not text:
        return list()
    display = list()
    for found in re.findall(url_finder, text):
        # Normalise the matched URL via the project's unicode helpers.
        url = unicode.encode(found[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        domain = getTLD(url).strip()
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            continue  # unresolvable host -- skip this URL
        # Stop processing entirely if anything resolves to localhost.
        if any(addr[4][0].startswith('127') or
               addr[4][0] in ('::1', '0:0:0:0:0:0:0:1')
               for addr in ips):
            break
        if url.startswith(EXCLUSION_CHAR):
            continue  # explicitly excluded URL
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None  # if it can't access the site fail silently
        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url
        # Don't echo titles that leak our own host addresses.
        if page_title and (IPv4_HOST in page_title or IPv6_HOST in page_title):
            break
        display.append([page_title, url, bitly])
    return display
Exemplo n.º 5
0
def get_results(text):
    """Find every URL in *text* and collect display info for each.

    Skips URLs whose domain does not resolve.  Aborts the whole scan as
    soon as any URL resolves to a loopback address, or a fetched title
    contains the local host name (guard against being tricked into
    fetching/echoing ourselves).  Returns a list of
    ``[page_title, url, bitly]`` triples.
    """
    if not text:
        return list()
    display = list()
    for found in re.findall(url_finder, text):
        # Normalise the matched URL via the project's unicode helpers.
        url = unicode.encode(found[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            continue  # unresolvable host -- skip this URL
        # Stop processing entirely if anything resolves to localhost.
        if any(addr[4][0].startswith('127') or
               addr[4][0] in ('::1', '0:0:0:0:0:0:0:1')
               for addr in ips):
            break
        if url.startswith(EXCLUSION_CHAR):
            continue  # explicitly excluded URL
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None  # if it can't access the site fail silently
        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url
        # Don't echo titles that leak our own host addresses.
        if page_title and (IPv4_HOST in page_title or IPv6_HOST in page_title):
            break
        display.append([page_title, url, bitly])
    return display
Exemplo n.º 6
0
Arquivo: url.py Projeto: hodiapa/jenni
def get_results(text):
    """Find every URL in *text* and collect display info for each.

    Returns a list of ``[page_title, url, bitly]`` triples.  *page_title*
    is None when the page could not be fetched; *bitly* falls back to the
    plain URL when the bit.ly shortener is not loaded.
    """
    # Guard against None/empty input (re.findall raises TypeError on
    # None); every other variant of this function has this check.
    if not text:
        return []
    display = []
    for found in re.findall(url_finder, text):
        # Normalise the matched URL via the project's unicode helpers.
        url = unicode.encode(found[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        if url.startswith(EXCLUSION_CHAR):
            continue  # explicitly excluded URL
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None  # if it can't access the site fail silently
        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url
        display.append([page_title, url, bitly])
    return display
Exemplo n.º 7
0
def get_results(text):
    """Find every URL in *text* and fetch titles via find_title_lite().

    Returns ``(passs, display)`` where *display* is a list of
    ``[page_title, url]`` pairs and *passs* is the status flag from the
    last find_title_lite() call (False when no URL was processed).
    """
    if not text:
        # Bug fix: the original returned a bare list() here, which broke
        # callers unpacking the usual two-element result.
        return False, list()
    display = list()
    passs = False
    for found in re.findall(url_finder, text):
        # Normalise the matched URL via the project's unicode helpers.
        url = uc.encode(found[0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)
        # NOTE(review): `domain` is computed but never used afterwards;
        # kept in case getTLD() has side effects -- confirm and drop.
        domain = getTLD(url)
        if '//' in domain:
            domain = domain.split('//')[1]
        if url.startswith(EXCLUSION_CHAR):
            continue  # explicitly excluded URL
        passs, page_title = find_title_lite(url)
        display.append([page_title, url])
    return passs, display
Exemplo n.º 8
0
    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        # Iteratively collapse runs of multiple spaces into single
        # spaces (same result as the original recursive version).
        while '  ' in x:
            x = x.replace('  ', ' ')
        return x

    title = remove_spaces(title)

    new_title = str()
    for char in title:
        unichar = uc.encode(char)
        if len(list(uc.encode(char))) <= 3:
            new_title += uc.encode(char)
    title = new_title

    title = re.sub(r'(?i)dcc\ssend', '', title)

    if title:
        return True, title
    else:
        return False, 'No Title'


def remove_nonprint(text):
    new = str()
    for char in text:
Exemplo n.º 9
0
Arquivo: url.py Projeto: theduke/jenni
def find_title(url):
    """
    Fetch *url* and return the text of its HTML <title> element.

    Returns None for ignored or non-HTML URLs, the string
    "Too many re-directs." after 50 redirect hops, and 'No title'
    when the page has an empty title.
    """
    uri = url

    # NOTE(review): `self` and `origin` are not parameters of this
    # function; this branch appears to rely on an enclosing scope or a
    # different calling convention -- verify before relying on it.
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    # Bail out entirely for blacklisted URL fragments.
    for item in IGNORE:
        if item in uri:
            return

    # Default to http:// when no scheme was supplied.
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri

    # Rewrite Twitter hash-bang URLs to their crawlable static form.
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow re-directs, if someone pastes a bitly of a tinyurl, etc..
    page = str()
    while True:
        req = urllib2.Request(uri, headers={'Accept':'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0')
        u = urllib2.urlopen(req)
        info = u.info()
        # Reads the full body on every hop; reused below if still set.
        page = u.read()
        u.close()

        # A list here comes from a custom opener; a plain urllib2
        # response object is treated as a 200.
        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            # 3xx: follow the Location header (relative URLs resolved).
            uri = urlparse.urljoin(uri, info['Location'])
        else: break

        redirects += 1
        if redirects >= 50:
            return "Too many re-directs."

    # Only try to extract a title from (X)HTML content.
    try: mtype = info['content-type']
    except: return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    # Re-fetch (first 256 KiB) if the redirect loop left no body.
    if not page:
        u = urllib2.urlopen(req)
        page = u.read(262144)
        u.close()
    content = page
    # Normalise <title ...> tags to bare <title>, then drop quoted
    # literal occurrences so find() locates the real element.
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>',content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('',content)
    start = content.find('<title>')
    if start == -1: return
    end = content.find('</title>', start)
    if end == -1: return
    content = content[start+7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content

    # Truncate absurdly long titles.
    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        # Decode one HTML entity match (&#xHH;, &#DD;, or named) to
        # UTF-8 bytes.
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1],16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else: title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        # Recursively collapse runs of spaces into single spaces.
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces (title)

    # Strip "DCC SEND" so echoing the title can't trigger IRC clients.
    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)

    if title:
        return title
    else:
        return 'No title'
Exemplo n.º 10
0
def find_title(url):
    """
    Fetch *url* and return the text of its HTML <title> element.

    Returns None for ignored or non-HTML URLs, the string
    "Too many re-directs." after 50 redirect hops, and 'No title'
    when the page has an empty title.
    """
    uri = url

    # NOTE(review): `self` and `origin` are not parameters of this
    # function; this branch appears to rely on an enclosing scope or a
    # different calling convention -- verify before relying on it.
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    # Bail out entirely for blacklisted URL fragments.
    for item in IGNORE:
        if item in uri:
            return

    # Default to http:// when no scheme was supplied.
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri

    # Rewrite Twitter hash-bang URLs to their crawlable static form.
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow re-directs, if someone pastes a bitly of a tinyurl, etc..
    while True:
        req = urllib2.Request(uri, headers={'Accept':'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0')
        u = urllib2.urlopen(req)
        info = u.info()
        u.close()

        # A list here comes from a custom opener; a plain urllib2
        # response object is treated as a 200.
        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            # 3xx: follow the Location header (relative URLs resolved).
            uri = urlparse.urljoin(uri, info['Location'])
        else: break

        redirects += 1
        if redirects >= 50:
            return "Too many re-directs."

    # Only try to extract a title from (X)HTML content.
    try: mtype = info['content-type']
    except: return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    # Fetch the page body (first 256 KiB only).
    u = urllib2.urlopen(req)
    bytes = u.read(262144)  # NOTE(review): shadows the `bytes` builtin
    u.close()
    content = bytes
    # Normalise <title ...> tags to bare <title>, then drop quoted
    # literal occurrences so find() locates the real element.
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>',content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('',content)
    start = content.find('<title>')
    if start == -1: return
    end = content.find('</title>', start)
    if end == -1: return
    content = content[start+7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content

    # Truncate absurdly long titles.
    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        # Decode one HTML entity match (&#xHH;, &#DD;, or named) to
        # UTF-8 bytes.
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1],16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else: title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        # Recursively collapse runs of spaces into single spaces.
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces (title)

    # Strip "DCC SEND" so echoing the title can't trigger IRC clients.
    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)

    if title:
        return title
    else:
        return 'No title'