def geturls(self):
    """Collect the unique ``.html`` links found in this page's HTML.

    If ``self.html`` is empty, ``self.fetch()`` is called first (presumably
    it populates ``self.html`` -- confirm against the class definition).
    Every ``<a>`` tag's ``href`` is then filtered and normalised:

    * the ``#fragment`` part is stripped;
    * links that do not end in ``.html``, contain ``..`` or start with
      ``mailto`` are skipped;
    * links that do not start with ``http://`` or ``https://`` are made
      absolute: ``self.root`` is prepended to paths starting with ``/``,
      otherwise ``self.base + "/"`` is prepended;
    * links that do not contain ``self.root`` are logged and skipped.

    Returns:
        A list of URL strings, de-duplicated, in first-seen order.
    """
    if not self.html:
        self.fetch()
    # Local import kept from the original: the parser module is resolved
    # lazily through the project's import helper.
    from tl.imports import getBeautifulSoup
    soup = getBeautifulSoup()
    parsed = soup.BeautifulSoup(self.html)
    urls = []
    seen = set()  # O(1) duplicate check; ``urls`` preserves the order
    for tag in parsed('a'):
        href = tag.get("href")
        if not href:
            continue
        href = href.split("#")[0]
        # An href that was only a fragment is now "" and fails this check too.
        if not href.endswith(".html"):
            continue
        if ".." in href or href.startswith("mailto"):
            continue
        # Test the scheme prefix rather than the substring "http", so that
        # relative paths such as "docs/http-guide.html" are still resolved.
        if not href.startswith(("http://", "https://")):
            if href.startswith("/"):
                href = self.root + href
            else:
                href = self.base + "/" + href
        if self.root not in href:
            logging.warning("%s not in %s", self.root, href)
            continue
        if href not in seen:
            seen.add(href)
            urls.append(href)
    logging.warning("found %s urls", len(urls))
    return urls
## tl imports from tl.utils.name import stripname from tl.utils.exception import handle_exception from tl.utils.urldata import UrlData from tl.utils.generic import waitforqueue from tl.utils.url import geturl2, striphtml, Url from tl.lib.datadir import getdatadir from tl.lib.persist import PersistCollection from tl.lib.commands import cmnds from tl.lib.examples import examples from tl.lib.threadloop import ThreadLoop from tl.lib.callbacks import callbacks from tl.imports import getBeautifulSoup soup = getBeautifulSoup() ## basic imports from collections import deque import os import logging import re import sys import time import math import urllib.request, urllib.error, urllib.parse import urllib.parse import optparse from cgi import escape from traceback import format_exc