def crawling(url):
  try:
    response = request(url)
    content = checks.page_encoding(response, action="decode")
    # Keep only the markup between the outermost <html> tags.
    match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
    if match:
      content = "<html>%s</html>" % match.group(1)
    soup = BeautifulSoup(content)
    tags = soup('a')
    if not tags:
      # Fall back to raw regex extraction when no <a> tags were parsed.
      tags = []
      tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
      tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)
    for tag in tags:
      # BeautifulSoup tags expose .get(); regex matches expose .group().
      href = tag.get("href") if hasattr(tag, settings.HTTPMETHOD.GET) else tag.group("href")
      if href:
        href = _urllib.parse.urljoin(url, href)
        if _urllib.parse.urlparse(url).netloc in href:
          # Skip cache-busting links, static assets and excluded extensions.
          if not re.search(r"\?(v=)?\d+\Z", href) and \
             not re.search(r"(?i)\.(js|css)(\?|\Z)", href) and \
             href.split('.')[-1].lower() not in settings.CRAWL_EXCLUDE_EXTENSIONS:
            if request(href):
              HREF_LIST.append(href)
    if len(HREF_LIST) != 0:
      return list(set(HREF_LIST))
    else:
      if not settings.VERBOSITY_LEVEL >= 2:
        print(settings.SPACE)
      warn_msg = "No usable links found."
      print(settings.print_warning_msg(warn_msg))
      raise SystemExit()
  except (UnicodeEncodeError, ValueError):
    # Ignore non-HTML files and invalid links.
    pass
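# A minimal, self-contained sketch of the same technique (same-origin link
# harvesting with regex fallbacks), using only the standard library. The names
# fetch(), harvest_links() and EXCLUDE_EXTENSIONS are hypothetical stand-ins
# for request(), crawling() and settings.CRAWL_EXCLUDE_EXTENSIONS above; the
# extension list is an assumed example, not the project's actual value.
import re
import urllib.parse
import urllib.request

EXCLUDE_EXTENSIONS = ("gif", "jpg", "jpeg", "png", "pdf", "zip")  # assumed

def fetch(url):
  # Plain GET; the real code would also apply custom headers / POST data here.
  with urllib.request.urlopen(url) as response:
    return response.read().decode("utf-8", errors="replace")

def harvest_links(seed_url):
  content = fetch(seed_url)
  candidates = set()
  # Same patterns as above: href/src attributes plus window.open() targets.
  for match in re.finditer(r'(?i)\s(?:href|src)=["\'](?P<href>[^>"\']+)', content):
    candidates.add(match.group("href"))
  for match in re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content):
    candidates.add(match.group("href"))
  netloc = urllib.parse.urlparse(seed_url).netloc
  links = set()
  for href in candidates:
    href = urllib.parse.urljoin(seed_url, href)    # resolve relative links
    if netloc not in href:                         # stay on the same host
      continue
    if re.search(r"(?i)\.(js|css)(\?|\Z)", href):  # skip static assets
      continue
    if href.split('.')[-1].lower() in EXCLUDE_EXTENSIONS:
      continue
    links.add(href)
  return sorted(links)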
def request(url):
  # Check if defined POST data
  if menu.options.data:
    request = urllib2.Request(url, menu.options.data)
  else:
    request = urllib2.Request(url)
  headers.do_check(request)
  response = urllib2.urlopen(request)
  soup = BeautifulSoup(response)
  return soup
def crawling(url):
  # Check if defined POST data
  if menu.options.data:
    request = urllib2.Request(url, menu.options.data)
  else:
    request = urllib2.Request(url)
  headers.do_check(request)
  response = urllib2.urlopen(request)
  html_data = response.read()
  soup = BeautifulSoup(html_data)
  href_list = []
  o = urlparse.urlparse(url)
  for tag in soup.findAll('a', href=True):
    tag['href'] = urlparse.urljoin(url, tag['href'])
    if o.netloc in tag['href']:
      href_list.append(tag['href'])
  return href_list
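# A hedged usage sketch of the findAll()-based extraction above, run against an
# inline HTML snippet instead of a live response. Assumes the bs4 package
# (BeautifulSoup 4) is installed; the sample URL and markup are made up.
from bs4 import BeautifulSoup
try:
  from urllib.parse import urljoin, urlparse  # Python 3
except ImportError:
  from urlparse import urljoin, urlparse      # Python 2

html_data = '<a href="/admin.php">admin</a> <a href="http://other.example/x">x</a>'
url = "http://target.example/index.php"
soup = BeautifulSoup(html_data, "html.parser")
href_list = []
for tag in soup.findAll('a', href=True):
  href = urljoin(url, tag['href'])             # "/admin.php" -> absolute URL
  if urlparse(url).netloc in href:             # keep same-host links only
    href_list.append(href)
print(href_list)                               # ['http://target.example/admin.php']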
def request(url):
  # Check if defined POST data
  if menu.options.data:
    request = _urllib.request.Request(url, menu.options.data.encode(settings.UNICODE_ENCODING))
  else:
    request = _urllib.request.Request(url)
  try:
    headers.do_check(request)
    response = _urllib.request.urlopen(request)
    soup = BeautifulSoup(response)
    return soup
  except _urllib.error.URLError:
    # Silently skip unreachable links; crawling() treats the implicit
    # None return value as a failed fetch.
    pass
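# The _urllib name above is not the standard-library module; it is presumably a
# Python 2/3 compatibility alias. A minimal sketch of how such an alias can be
# bound with six (the exact import path used by the codebase is an assumption):
try:
  from six.moves import urllib as _urllib  # unified request/error/parse on 2 and 3
except ImportError:
  # Python 3 without six: the stdlib layout already matches what the code expects.
  import urllib.request, urllib.error, urllib.parse
  import urllib as _urllib

req = _urllib.request.Request("http://target.example/")  # sample URL, made up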
def request(url):
  # Check if defined POST data
  if menu.options.data:
    request = urllib2.Request(url, menu.options.data)
  else:
    request = urllib2.Request(url)
  try:
    headers.do_check(request)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response)
    return soup
  except urllib2.URLError as e:
    err_msg = "Unable to connect to the target URL "
    err_msg += "(" + str(e.args[0]).split("] ")[1] + ")."
    print(settings.print_critical_msg(err_msg))
    raise SystemExit()
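# The error-message slicing above assumes e.args[0] renders like
# "[Errno -2] Name or service not known"; split("] ")[1] then keeps the text
# after the bracketed errno. A standalone illustration (the sample reason
# string is made up; note the slice raises IndexError if the reason has no
# bracketed prefix):
reason = "[Errno 111] Connection refused"
print("Unable to connect to the target URL (" + str(reason).split("] ")[1] + ").")
# -> Unable to connect to the target URL (Connection refused).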