from urlparse import urljoin


def expand_urls(page, urls):
    '''expand all urls in the page'''
    parent = base_url(page.strip())
    # print 'parent=', parent
    urls = [urljoin(parent, url.strip()) for url in urls]
    rets = []
    for url in urls:
        try:
            nurl = normalize_url(url)
        except Exception, e:
            # use % formatting; a comma here would print the tuple itself
            print 'error when normalize_url %s: %s' % (url, e)
            continue
        rets.append(nurl)
    return rets
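
# `base_url` is referenced above but not defined in this snippet. A minimal
# sketch of what it presumably does (an assumption, not the original code):
# strip the query and fragment so the remaining scheme/netloc/path of the
# page URL can serve as the base for urljoin.
from urlparse import urlsplit, urlunsplit


def base_url(url):
    scheme, netloc, path, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc, path, '', ''))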
from urlparse import urljoin

from bs4 import BeautifulSoup


def get_urls(self, url, document):
    """Gets all the URLs in a document and returns them as absolute URLs.

    url -- the URL of the document
    document -- the content of the document
    """
    urls = []
    soup = BeautifulSoup(document)
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            try:
                # Convert relative URLs to absolute URLs
                if not href.startswith('http'):
                    href = urljoin(url, href)
                href = normalize_url(href)
                urls.append(href)
            except Exception:
                # skip links that fail to normalize
                pass
    return urls
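
# Minimal usage sketch (assumed, not from the original): fetch a page with
# urllib2 and extract its links. The body of get_urls never touches `self`,
# so passing None works when calling it outside a class; example.com is a
# placeholder URL.
import urllib2

url = 'http://example.com/'
document = urllib2.urlopen(url).read()
for link in get_urls(None, url, document):
    print link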
    >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    :param charset: The target charset for the URL if the url was
                    given as a unicode string.
    '''
    if isinstance(url, unicode):
        url = url.encode(charset, 'ignore')
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(url)
    path = urllib.quote(path, '/%')
    qs = urllib.quote_plus(qs, ':&=')
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))


if __name__ == '__main__':
    url = 'http://a.a//../../asd/kk/../../../asd.asd/./ss/./././hsadk...$?1=1#kasjdl-qw'
    print url_merge_dots(url)
    print normalize_url(url)

    page = 'http://jadesoul-home'
    urls = u'''
    http://jadesoul-home/index.php
    http://jadesoul-home/?p=30
    a.html
    a/b/c/d.txt
    a/b/../c/d.txt
    http://a.a//../../asd/kk/../../../asd.asd/./ss/./././hsadk...$?1=1#kasjdl-qw
    https://www.abc.com./a.txt
    http://www.abc.com:80/a.txt
    https://www.abc.com.:8080/a.txt
    ftp://www.abc.com:21/a.txt
    ftp://www.abc.com:21/a.txt
    ftp://www.abc.com:21/ a.txt
    '''
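
# `url_merge_dots` and `normalize_url` are used above but defined elsewhere
# in this module. For reference, a minimal sketch of what url_merge_dots
# presumably does (an assumption, not the original implementation): resolve
# '.' and '..' segments in the path and leave the rest of the URL untouched.
import posixpath
from urlparse import urlsplit, urlunsplit


def url_merge_dots_sketch(url):
    scheme, netloc, path, qs, anchor = urlsplit(url)
    if path:
        # posixpath.normpath collapses '.' segments and redundant slashes
        # and resolves '..'; edge cases (trailing slash, leading '//') may
        # differ from the original helper.
        path = posixpath.normpath(path)
    return urlunsplit((scheme, netloc, path, qs, anchor))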