Example #1
from urlparse import urljoin

def expand_urls(page, urls):
	'''
	Expand all URLs found in a page to absolute, normalized URLs.

	page -- the URL of the page the links were extracted from
	urls -- an iterable of (possibly relative) URL strings
	'''
	# base_url() and normalize_url() are helpers defined elsewhere in this module
	parent = base_url(page.strip())
	urls = [urljoin(parent, url.strip()) for url in urls]
	rets = []
	for url in urls:
		try:
			nurl = normalize_url(url)
		except Exception, e:
			print 'error when normalize_url %s : %s' % (url, e)
			continue
		rets.append(nurl)
	return rets
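
The heavy lifting in expand_urls is done by urljoin from the standard library; base_url() and normalize_url() are project-specific helpers not shown here. A minimal, self-contained sketch of the same expansion step, using a made-up page URL and link list, might look like this:

# Minimal sketch of the expansion step using only the standard library;
# the page URL and link list below are made up for illustration.
from urlparse import urljoin

page = 'http://jadesoul-home/blog/index.php'
links = ['a.html', 'a/b/../c/d.txt', 'http://example.com/x']
absolute = [urljoin(page, link.strip()) for link in links]
for link in absolute:
	print link
# http://jadesoul-home/blog/a.html
# http://jadesoul-home/blog/a/c/d.txt
# http://example.com/x
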
Example #2
    def get_urls(self, url, document):
        """
        Gets all the URLs in a document and returns them as absolute URLs.

        url -- The URL of the document
        document -- The content of the document
        """
        urls = []
        soup = BeautifulSoup(document, 'html.parser')

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is not None:
                try:
                    # Convert relative URLs to absolute URLs
                    if not href.startswith('http'):
                        href = urljoin(url, href)
                    # normalize_url() is a helper defined elsewhere; links it rejects are skipped
                    href = normalize_url(href)

                    urls.append(href)
                except Exception:
                    pass

        return urls
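
A self-contained way to exercise the same extraction loop (assuming bs4 is installed; the project-specific normalize_url() step is skipped here, and the HTML snippet and base URL are made up) could look like this:

    # Rough illustration of the same loop on an inline HTML snippet;
    # normalize_url() from the method above is omitted because it is project-specific.
    from bs4 import BeautifulSoup
    from urlparse import urljoin

    html = '<a href="a.html">a</a> <a href="http://example.com/b">b</a> <a>no href</a>'
    base = 'http://jadesoul-home/blog/'
    found = []
    for link in BeautifulSoup(html, 'html.parser').find_all('a'):
        href = link.get('href')
        if href is not None:
            found.append(urljoin(base, href))
    for u in found:
        print u
    # http://jadesoul-home/blog/a.html
    # http://example.com/b
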
Example #4
import urllib
import urlparse

def url_fix(url, charset='utf-8'):
	'''
	Quote unsafe characters (such as spaces) in a URL, in a similar way
	to how browsers handle data entered by the user, e.g.:

	>>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
	'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

	:param charset: The target charset for the URL if the url was
		given as a unicode string.
	'''
	if isinstance(url, unicode):
		url = url.encode(charset, 'ignore')
	scheme, netloc, path, qs, anchor = urlparse.urlsplit(url)
	path = urllib.quote(path, '/%')
	qs = urllib.quote_plus(qs, ':&=')
	return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
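
The split / quote / unsplit pattern above is all standard library. For reference, a small sketch of the two quoting calls on their own (Python 2 urllib and urlparse, with a made-up URL):

# The path and the query string are quoted with different safe-character sets,
# mirroring the url_fix() body above (example URL is made up).
import urllib
import urlparse

scheme, netloc, path, qs, anchor = urlparse.urlsplit('http://example.com/a path/file?q=a b&x=1')
print urllib.quote(path, '/%')        # /a%20path/file
print urllib.quote_plus(qs, ':&=')    # q=a+b&x=1
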
    
if __name__ == '__main__':
	url = 'http://a.a//../../asd/kk/../../../asd.asd/./ss/./././hsadk...$?1=1#kasjdl-qw'
	print url_merge_dots(url)
	print normalize_url(url)

	page = 'http://jadesoul-home'
	urls = u'''
		http://jadesoul-home/index.php
		http://jadesoul-home/?p=30
		a.html
		a/b/c/d.txt
		a/b/../c/d.txt
		http://a.a//../../asd/kk/../../../asd.asd/./ss/./././hsadk...$?1=1#kasjdl-qw
		https://www.abc.com./a.txt
		http://www.abc.com:80/a.txt
		https://www.abc.com.:8080/a.txt
		ftp://www.abc.com:21/a.txt
		ftp://www.abc.com:21/a.txt
		ftp://www.abc.com:21/ a.txt