def filter(self, ua):
    """Remove all of the urls in URLS that UA is not allowed to crawl,
    and fill in the .crawl_delay and .robots_url properties."""
    rules = None
    for url in sorted(self.urls):
        robots_url = Robots.robots_url(url)
        if self.robots_url != robots_url:
            if self.robots_url is None:
                try:
                    rules = Robots.fetch(robots_url, headers={
                        'User-Agent': ua
                    }).agent(ua)
                except Exception as e:
                    sys.stderr.write(
                        "warning: failed to fetch and parse {}: {}\n"
                        .format(robots_url, e))
                    rules = DummyAgent()
                self.robots_url = robots_url
                self.crawl_delay = rules.delay or 1
            else:
                raise ValueError("robots.txt for {} is {}, not {}".format(
                    url, robots_url, self.robots_url))
        if not rules.allowed(url):
            self.urls.remove(url)
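# filter() above falls back to DummyAgent() when robots.txt cannot be fetched.
# That class is not shown in this snippet; a minimal sketch of a permissive
# stand-in, assuming only the two members the code touches (.delay and
# .allowed()):
class DummyAgent:
    """Permissive fallback used when robots.txt is unavailable."""
    delay = None  # No Crawl-delay, so `rules.delay or 1` yields 1

    def allowed(self, url):
        # With no rules to consult, allow everything.
        return True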
def __init__(self, url, limit_pages_counter=1, storage=False, meta=None,
             debug=False, user_agent='Spidar/1.1',
             allow_external_link_crawling=False,
             selenium_chrome_driver=None):
    self.__start_url = url
    parse_location = urlparse(url)
    self.__initial_domain_name = parse_location.netloc
    self.__pages = []
    self.__url_to_discover = set()
    self.__url_discovered = set()
    self.__max_counter_pages = limit_pages_counter
    self.__storage = storage
    self.__PATH_STORAGE = '__storage/'
    self.__PATH_SOURCE = self.__PATH_STORAGE + 'sources/'
    self.__PATH_INFO = self.__PATH_STORAGE + 'infos/'
    # Avoid the shared-mutable-default pitfall (`meta={}`) by defaulting to None
    self.__meta = meta if meta is not None else {}
    self.__debug = debug
    self.__user_agent = user_agent
    self.__allow_external_link_crawling = allow_external_link_crawling
    self.__selenium_chrome_driver = selenium_chrome_driver
    self.__selenium_driver = None
    if self.__selenium_chrome_driver is not None:
        self.__selenium_driver = webdriver.Chrome(
            executable_path=self.__selenium_chrome_driver)
    self.__rp = Robots.fetch(Robots.robots_url(self.__start_url))
    if self.__storage:
        self.__set_up_folders(self.__initial_domain_name)
def _check_robots(url):
    """Check that our crawler satisfies the robots exclusion standard."""
    try:
        robot_url = Robots.robots_url(url)
        parser = robotparser.RobotFileParser()
        parser.set_url(robot_url)
        parser.read()
        return parser.can_fetch('*', url)
    except Exception:
        # If robots.txt cannot be fetched or parsed, assume crawling is allowed
        return True
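# _check_robots() mixes reppy (to locate robots.txt) with the stdlib parser.
# A minimal stdlib-only sketch, assuming Python 3's urllib.robotparser;
# urlunparse rebuilds the /robots.txt URL that Robots.robots_url() would
# have produced.
import urllib.robotparser
from urllib.parse import urlparse, urlunparse


def _check_robots_stdlib(url):
    try:
        parts = urlparse(url)
        robot_url = urlunparse(
            (parts.scheme, parts.netloc, '/robots.txt', '', '', ''))
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robot_url)
        parser.read()
        return parser.can_fetch('*', url)
    except Exception:
        # Treat an unreachable or unparsable robots.txt as permissive,
        # like the original helper does.
        return True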
def website_allows_scraping(url):
    """
    Args:
        url: The URL of the website you are trying to check.

    Returns:
        True if the website's policy allows you to scrape, otherwise False.
    """
    robot_url = Robots.robots_url(url)
    robot = Robots.fetch(robot_url)
    return robot.allowed(url, USER_AGENT)
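# website_allows_scraping() refetches robots.txt on every call. A minimal
# sketch of a memoized variant, assuming the same reppy Robots API and a
# module-level USER_AGENT constant; lru_cache keys on the robots.txt URL,
# so each host is fetched at most once per process.
from functools import lru_cache

from reppy.robots import Robots


@lru_cache(maxsize=128)
def _fetch_robots(robot_url):
    # Cached by robot_url; repeated checks against one host reuse this fetch
    return Robots.fetch(robot_url)


def website_allows_scraping_cached(url):
    return _fetch_robots(Robots.robots_url(url)).allowed(url, USER_AGENT)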
def filter_urls(urls, ua):
    """Partition URLS (an iterable) into sites, and then filter out all of
    the urls in each site that UA is not allowed to crawl. Returns a list
    of Site objects."""
    sites = defaultdict(Site)
    for url in urls:
        url = canon_url_syntax(url)
        robots_url = Robots.robots_url(url)
        sites[robots_url].add(url)
    for site in sites.values():
        site.filter(ua)
    return sorted(sites.values(), key=lambda s: s.robots_url)
def findSitemap(url):
    '''
    Find the location of an XML sitemap.

    :param url: a URL in string format
    :return: a list of discovered XML sitemap URLs
    '''
    # Website is using best practices: robots.txt lists its sitemaps
    robotUrl = Robots.robots_url(url)
    robots = Robots.fetch(robotUrl)
    sitemaps = list(robots.sitemaps)
    if sitemaps:
        return sitemaps

    # Website is not using best practice, so take some guesses
    common_locations = ['/sitemap.xml', '/sitemap_index.xml']
    discoveredSitemaps = []
    while len(common_locations) > 0:
        # Mutate the input URL into a guessed sitemap location
        makeUrl = urlparse(url)
        guessed_sitemap = makeUrl._replace(path=common_locations.pop(0))
        guess = guessed_sitemap.geturl()
        r = requests.get(guess)
        # Match the guessed path against the response URL to make sure the
        # page actually resolves rather than redirecting elsewhere
        guessPath = urlparse(guess).path
        responsePath = urlparse(r.url).path
        if r.status_code == 200 and guessPath == responsePath:
            discoveredSitemaps.append(guess)
    return discoveredSitemaps
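# findSitemap() returns sitemap URLs but not their contents. A minimal sketch
# of extracting page URLs from one discovered sitemap, assuming the standard
# <urlset>/<loc> layout and using only the stdlib XML parser:
import requests
import xml.etree.ElementTree as ET

# Namespace used by the sitemaps.org protocol
SITEMAP_NS = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}


def sitemap_urls(sitemap_url):
    root = ET.fromstring(requests.get(sitemap_url).content)
    # Collect every <loc> entry; for a sitemap index these are child sitemaps
    return [loc.text for loc in root.findall('.//sm:loc', SITEMAP_NS)]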
def robots_check(link):
    """
    py:function:: robots_check(link)

    Checks if a webpage is allowed to be crawled.

    :param link: link from crawler_process
    :type link: string containing the url
    :returns allow: True if allowed, False if not
    """
    robots = None
    allow = True
    url = Robots.robots_url(link)
    if 'http' in url:
        try:
            robots = Robots.fetch(url)
        except requests.exceptions.SSLError:
            print("SSLError")
            allow = False
        except Exception:
            allow = False
    if robots is not None:
        allow = robots.allowed(link, 'agent')
    return allow
#################################################
#################################################
# Fetch the content of each url in the frontier
while len(url_frontier) != 0:
    # Pop any random url
    url = url_frontier.pop()
    try:
        print("\n---------------------------------------------------------")
        print("Crawling:", url)
        print("---------------------------------------------------------")

        # Get the cached robots.txt rules for this url
        r = robots_cache.fetch(Robots.robots_url(url))[1]

        # Check whether we are allowed to crawl this url; if not, skip it
        if not robots_cache.allowed(url, '*'):
            print("Crawling this URL is disallowed by robots.txt.")
            continue

        # Insert this link into the database
        cur.execute("INSERT OR IGNORE INTO crawled_urls (url_link) values(?)",
                    (url,))

        # If we are allowed to crawl, honor the crawling delay
        crawl_delay = r.agent("*").delay
        if crawl_delay is not None:
            time.sleep(crawl_delay)
        else:
def get_robot_instance(url):
    txt_location = Robots.robots_url(url)
    return Robots.fetch(txt_location)
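# get_robot_instance() fetches robots.txt anew for every call. reppy also
# ships a cache layer; a minimal sketch, assuming reppy's documented
# reppy.cache.RobotsCache API, which fetches and caches robots.txt per host:
from reppy.cache import RobotsCache

cache = RobotsCache(capacity=100)

# The cache resolves the robots.txt location itself and reuses prior fetches
print(cache.allowed('https://example.com/some/page', 'my-user-agent'))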
def crawl(thread_id, conn):
    global is_running, robots_cache, history_cache

    # Init
    print("===Thread " + str(thread_id) + " started===")
    f_info = open("info.log", "a", encoding="utf-8")
    f_link = open("onclick.log", "a", encoding="utf-8")
    f_debug = open("debug.log", "a", encoding="utf-8")

    # Create Selenium webdriver and set User-Agent
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", USER_AGENT)  # Set User-Agent
    profile.set_preference("devtools.jsonview.enabled", False)  # Disable JSON viewer
    profile.set_preference("dom.enable_window_print", False)  # Disable window.print()
    driver = wirewebdriver.Firefox(profile)
    driver.set_page_load_timeout(SELENIUM_TIMEOUT)

    # Create DB connection cursor
    cur = conn.cursor()

    # Save most recent page_url for debugging
    page_url = None

    # Wrap main loop in try/catch block for exception handling
    try:
        ###################
        # Main loop start #
        ###################
        while is_running:
            # Get next element in frontier
            if not thread_active[thread_id]:
                time.sleep(0.5)  # Add small delay while waiting
            with frontier_lock:
                cur.execute(
                    "SELECT id, url FROM crawler.page WHERE page_type_code = 'FRONTIER' ORDER BY accessed_time ASC"
                )
                res = cur.fetchone()
                if res is not None:
                    cur.execute(
                        "UPDATE crawler.page SET page_type_code = 'PROCESSING' WHERE id = %s",
                        (res[0], ))
                frontier_element = res
            if frontier_element is None:
                thread_active[thread_id] = False
                if not any(thread_active):
                    break
                continue
            thread_active[thread_id] = True

            # Get URL and domain
            page_id, page_url = frontier_element
            # print(f"[{thread_id}] Processing URL: " + page_url)
            page_url_parsed = urllib.parse.urlparse(page_url)
            domain = page_url_parsed.netloc

            # Get robots.txt
            robots_url = Robots.robots_url(page_url)
            robots = robots_agent = None
            with robots_lock:
                if robots_url in robots_cache:
                    robots, robots_agent = robots_cache[robots_url]
                else:
                    try:
                        robots = Robots.fetch(
                            robots_url, headers={"User-Agent": USER_AGENT})
                        robots_agent = robots.agent(USER_AGENT)
                        # Logging robots.txt
                        f = open("robots.log", "a")
                        if robots.sitemaps:
                            f.write(f"[SITEMAP] {robots.sitemaps} on {domain}\n")
                        if robots_agent.delay is not None:
                            f.write(f"[CRAWL DELAY] {robots_agent.delay} on domain {domain}\n")
                        f.close()
                    except Exception:
                        # Error fetching robots.txt
                        pass
                    robots_cache[robots_url] = (robots, robots_agent)

            # Get site data
            site_id = None
            cur.execute("SELECT * FROM crawler.site WHERE domain = %s",
                        (domain, ))
            res = cur.fetchone()
            if res is None:
                # Get robots.txt and sitemap.xml content
                robots_content = curl(robots_url)
                sitemap_content = None
                if robots is not None and robots.sitemaps is not None:
                    # Check sitemaps in robots.txt
                    for sitemap_url in robots.sitemaps:
                        sitemap_content = curl(sitemap_url)
                        if sitemap_content is not None:
                            break
                if sitemap_content is None:
                    # Check /sitemap.xml
                    sitemap_url = f"{page_url_parsed.scheme}://{page_url_parsed.netloc}/sitemap.xml"
                    sitemap_content = curl(sitemap_url)
                # Insert site into DB
                cur.execute(
                    "INSERT INTO crawler.site(domain, robots_content, sitemap_content) VALUES (%s, %s, %s) RETURNING id",
                    (domain, robots_content, sitemap_content))
                site_id = cur.fetchone()[0]
            else:
                # Fetch site from DB
                site_id = res[0]

            # Check robots.txt compliance
            ## Allowed/disallowed
            if robots_agent is not None:
                if not robots_agent.allowed(page_url):
                    # Mark page as disallowed
                    with frontier_lock:
                        cur.execute(
                            "UPDATE crawler.page " +
                            "SET page_type_code = 'DISALLOWED', accessed_time = now() " +
                            "WHERE id = %s", (page_id, ))
                    continue

            ## Crawl delay
            crawl_delay = CRAWL_DELAY
            if robots_agent is not None and robots_agent.delay is not None:
                crawl_delay = robots_agent.delay
            with history_lock:
                if domain in history_cache:
                    last_time = history_cache[domain]
                    curr_time = time.time()
                    if curr_time - last_time > crawl_delay:
                        # Update last accessed time
                        history_cache[domain] = curr_time
                    else:
                        # Move page to the back of the frontier
                        with frontier_lock:
                            cur.execute(
                                "UPDATE crawler.page " +
                                "SET page_type_code = 'FRONTIER', accessed_time = now() " +
                                "WHERE id = %s", (page_id, ))
                        time.sleep(0.1)  # Small delay before getting next frontier element
                        continue
                else:
                    history_cache[domain] = time.time()

            # Get HTTP header
            http_status_code = None
            page_content_type = None
            page_content_disposition = None
            try:
                response = requests.head(page_url,
                                         allow_redirects=True,
                                         headers={"User-Agent": USER_AGENT},
                                         timeout=REQUEST_TIMEOUT)
                http_status_code = response.status_code
                if "Content-Type" in response.headers:
                    page_content_type = response.headers["Content-Type"]
                if "Content-Disposition" in response.headers:
                    page_content_disposition = response.headers["Content-Disposition"]
            except requests.exceptions.SSLError:
                pass
            except requests.exceptions.Timeout:
                pass
            except requests.exceptions.ConnectionError:
                pass
            except Exception as e:
                print(f"[HEAD UNKNOWN] Error getting URL HEAD: {page_url}\n\t{e}")
                f_debug.write(f"[HEAD UNKNOWN] Error getting URL HEAD: {page_url}\n\t{e}\n")

            # Check if page is a binary file
            if (page_content_type is not None and not page_content_type.startswith("text/")) or \
                    (page_content_disposition is not None and not page_content_disposition.startswith("inline")):
                cur.execute(
                    "UPDATE crawler.page " +
                    "SET site_id = %s, page_type_code = 'BINARY', " +
                    "http_status_code = %s, accessed_time = now() " +
                    "WHERE id = %s", (site_id, http_status_code, page_id))
                continue

            # Get HTTP body
            try:
                driver.get(page_url)
                time.sleep(SELENIUM_WAIT)
            except TimeoutException:
                timestamp = datetime.now().strftime("%H:%M:%S")
                print(f"[GET TIMEOUT] [{timestamp}] Timeout on URL: {page_url}\n")
                f_debug.write(f"[GET TIMEOUT] [{timestamp}] Timeout on URL: {page_url}\n\n")
                with frontier_lock:
                    cur.execute(
                        "UPDATE crawler.page " +
                        "SET site_id = %s, page_type_code = 'UNAVAILABLE', html_content = NULL, " +
                        "html_hash = NULL, http_status_code = NULL, accessed_time = now() " +
                        "WHERE id = %s", (site_id, page_id))
                continue
            except Exception as e:
                print(f"[GET UNKNOWN] Unable to get URL: {page_url}\n\t{e}")
                f_debug.write(f"[GET UNKNOWN] Unable to get URL: {page_url}\n\t{e}\n")
                with frontier_lock:
                    cur.execute(
                        "UPDATE crawler.page " +
                        "SET site_id = %s, page_type_code = 'UNAVAILABLE', html_content = NULL, " +
                        "html_hash = NULL, http_status_code = NULL, accessed_time = now() " +
                        "WHERE id = %s", (site_id, page_id))
                continue

            # Close alert if present
            try:
                alert = driver.switch_to.alert
                alert.dismiss()  # alert.accept()
                f_debug.write(f"[ALERT PRESENT] On URL: {page_url}\n")
            except NoAlertPresentException:
                pass

            # Get HTML source
            html = driver.page_source
            driver_requests = {}
            for request in driver.requests:
                driver_requests[request.url] = request.response

            # Get html hash and check for duplicates
            html_hash = str(hash(html))
            cur.execute("SELECT id FROM crawler.page WHERE html_hash = %s",
                        (html_hash, ))
            res = cur.fetchone()
            if res is not None:
                # Duplicate detected
                duplicate_page_id = res[0]
                cur.execute(
                    "UPDATE crawler.page " +
                    "SET site_id = %s, page_type_code = 'DUPLICATE', html_content = NULL, " +
                    "html_hash = NULL, http_status_code = %s, accessed_time = now() " +
                    "WHERE id = %s", (site_id, http_status_code, page_id))
                cur.execute(
                    "INSERT INTO crawler.link(from_page, to_page) VALUES (%s, %s)",
                    (page_id, duplicate_page_id))
                continue

            # Update page in DB
            cur.execute(
                "UPDATE crawler.page " +
                "SET site_id = %s, page_type_code = 'HTML', html_content = %s, " +
                "html_hash = %s, http_status_code = %s, accessed_time = now() " +
                "WHERE id = %s",
                (site_id, html, html_hash, http_status_code, page_id))

            # Find all href links
            # <area> link example - https://www.stopbirokraciji.gov.si/
            elems = driver.find_elements_by_xpath("//a[@href] | //area[@href]")  # "//body//*[@href]"
            for elem in elems:
                try:
                    href = elem.get_attribute("href")
                    if href is not None:
                        process_link(cur, page_id, page_url, href, f_info,
                                     f_link, f_debug)
                except StaleElementReferenceException as e:
                    f_debug.write(f"[STALE ELEMENT] On URL: {page_url}\n\tException: {e}\n")
                    continue

            # Check if any non-<a> tags contain href for debugging purposes
            elems = driver.find_elements_by_xpath("//body//*[@href]")
            for elem in elems:
                try:
                    href = elem.get_attribute("href")
                    if elem.tag_name != "a" and elem.tag_name != "area" \
                            and href is not None and not href.startswith("#") \
                            and not href.startswith("javascript:"):
                        f_debug.write(f"[HREF TAG] <{elem.tag_name}>, href='{href}' on URL {page_url}\n")
                except StaleElementReferenceException as e:
                    f_debug.write(f"[STALE ELEMENT] On URL: {page_url}\n\tException: {e}\n")
                    continue

            # Find all onclick links
            # document.location, self.location, window.location, location.href
            elems = driver.find_elements_by_xpath("//*[@onclick]")
            for elem in elems:
                onclick = elem.get_attribute("onclick")
                if onclick is not None and onclick.strip() != "":
                    f_link.write(onclick + "\n--------\n")
                    # matches = re.findall(r"((document\.location)|(location\.href)|(self\.location)|(window\.location))(.|\n)*?(;|$)", onclick)
                    matches = re.findall(
                        r"(((document|window|self)\.location|location\.href)[^;]*)",
                        onclick)
                    for match in matches:
                        result = re.search("(\".*\")|('.*')|(`.*`)", match[0])
                        if result is not None:
                            onclick_url = result.group()[1:-1]
                            f_link.write(onclick_url + "\n")
                            process_link(cur, page_id, page_url, onclick_url,
                                         f_info, f_link, f_debug)
                    f_link.write("\n\n")

            # Find all images
            elems = driver.find_elements_by_xpath("//img[@src]")
            for elem in elems:
                img_src = elem.get_attribute("src")
                if img_src is not None:
                    # Parse src
                    img_src_parsed = urllib.parse.urlparse(img_src)
                    img_path = img_src_parsed.path
                    img_query = img_src_parsed.query

                    # Ignore empty src
                    if img_src.strip() == "":
                        continue
                    # Ignore base64 images
                    if img_src.startswith("data:"):
                        continue
                    # Ignore src="#"
                    if img_path == "/" and img_query == "" and img_src.endswith("#"):
                        continue

                    # Get image name
                    # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img
                    img_exts = [
                        ".jpg", ".jpeg", ".png", ".svg", ".gif", ".webp",
                        ".apng", ".avif", ".bmp", ".ico", ".cur", ".tif",
                        ".tiff"
                    ]
                    img_name = None
                    img_ext = None
                    for img_ext in img_exts:
                        if img_path.lower().endswith(img_ext):
                            img_name = os.path.basename(img_path)
                            break
                    if img_name is None:
                        img_name = os.path.basename(img_path)
                        if img_name == "":
                            print(f"[IMG NAME ERROR] src {img_src} on URL {page_url}")
                            f_debug.write(f"[IMG NAME ERROR] src {img_src} on URL {page_url}\n")
                            img_name = None

                    # Get image content type
                    img_url = urllib.parse.urljoin(page_url, img_src)
                    img_content_type = None
                    if img_url in driver_requests:  # img_src
                        # Check Selenium request
                        response = driver_requests[img_url]
                        if response and response.status_code == 200 and "Content-Type" in response.headers:
                            img_content_type = response.headers["Content-Type"]
                    else:
                        # Manually send request
                        try:
                            response = requests.head(
                                img_url,
                                allow_redirects=True,
                                headers={"User-Agent": USER_AGENT},
                                timeout=REQUEST_TIMEOUT)
                            if response.status_code == 200 and "Content-Type" in response.headers:
                                img_content_type = response.headers["Content-Type"]
                        except requests.exceptions.SSLError:
                            f_debug.write(f"[IMG HEAD SSL] SSL exception on src: {img_url}\n")
                        except requests.exceptions.Timeout:
                            f_debug.write(f"[IMG HEAD TIMEOUT] Timeout exception on src: {img_url}\n")
                        except requests.exceptions.ConnectionError:
                            f_debug.write(f"[IMG HEAD CONNECTION] Connection error on src: {img_url}\n")
                        except Exception as e:
                            print(f"[IMG HEAD UNKNOWN] Unknown exception on src: {img_url}\n\tOn page: {page_url}\n\t{e}")
                            f_debug.write(f"[IMG HEAD UNKNOWN] Unknown exception on src: {img_url}\n\t{e}\n")

                    # Check if file is an image
                    if img_content_type is not None:
                        if not img_content_type.startswith("image/"):
                            f_debug.write(f"[IMG CONTENT TYPE] On src: {img_url}\n\tOn url: {page_url}\n")
                            continue

                    # If requests failed, get content type from filename
                    if img_content_type is None and img_ext is not None:
                        img_content_type = f"image/{img_ext[1:]}"

                    # Error check
                    if img_name is None or img_content_type is None:
                        print(f"[IMG META ERROR] src {img_src} on URL {page_url}")
                        f_debug.write(f"[IMG META ERROR] src {img_src} on URL {page_url}\n")

                    # Save image metadata to DB
                    cur.execute(
                        "INSERT INTO crawler.image(page_id, filename, content_type) " +
                        "VALUES (%s, %s, %s) ",
                        (page_id, img_name, img_content_type))
        #################
        # Main loop end #
        #################
    except Exception as e:
        # Print exception
        print(f"[{thread_id}] [UNHANDLED EXCEPTION] on URL: {page_url}\n\t{e}")
        f_debug.write(f"[{thread_id}] [UNHANDLED EXCEPTION] on URL: {page_url}\n\t{e}\n")

    # Cleanup
    f_info.close()
    f_link.close()
    f_debug.close()
    thread_active[thread_id] = False
    cur.close()
    driver.close()
    print("===Thread " + str(thread_id) + " finished===")
def __extract_info(self, url):
    self.__print_debug('crawling page', url)
    parsed_url = urlparse(url)
    if parsed_url.netloc == self.__initial_domain_name:
        if not self.__rp.allowed(url, self.__user_agent):
            self.__print_debug('disallowed by user agent')
            return None
    else:
        current_robot = Robots.fetch(Robots.robots_url(url))
        if not current_robot.allowed(url, self.__user_agent):
            self.__print_debug('disallowed by user agent')
            return None

    content, is_html, language = self.__crawl_page(url)
    if content is None:
        return None

    path = urlparse(url).path.replace('/', '_')
    if path is None or path == '':
        path = '__index__'

    if self.__storage:
        self.__set_up_folders(parsed_url.netloc)
        fsource = open(
            self.__PATH_SOURCE + parsed_url.netloc + '/' + path + '.html',
            'wb')
        fsource.write(content)
        fsource.close()

    if not is_html:
        self.__pages.append({
            'content': content,
            'language': language,
            'url': url,
            'html': content
        })
        return content

    soup = BeautifulSoup(content, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None or '#' in href:
            continue
        if href.startswith('http'):
            self.__add_url(href)
            continue
        if href.startswith('mailto'):
            continue
        new_url = str(urljoin(url, href))
        self.__add_url(new_url)

    texts = soup.findAll(text=True)
    visible_texts = filter(self.__tag_visible, texts)
    visible_texts = ' '.join(t.strip() for t in visible_texts
                             if t.strip() != '')

    if self.__storage:
        fout = open(
            self.__PATH_INFO + parsed_url.netloc + '/' + path + '.json', 'w')
        fout.write(
            json.dumps({
                'url': url,
                'domain_name': parsed_url.netloc,
                'html': content.decode('utf-8'),
                'language': language,
                'content': visible_texts,
                'meta': self.__meta,
            }))
        fout.close()

    self.__pages.append({
        'content': visible_texts,
        'language': language,
        'url': url,
        'html': content
    })
from reppy.robots import Robots

# Grab the robots.txt url
url = Robots.robots_url('https://science.rpi.edu/computer-science')
if 'http' in url:
    # print(url)
    robots = Robots.fetch(url)
    # print(robots)
    print(robots.allowed('https://science.rpi.edu/computer-science/', 'agent'))
    print(robots.allowed('https://science.rpi.edu/admin/', 'agent'))
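# The demo above passes a literal 'agent' token to every allowed() call.
# A minimal sketch of per-agent queries, assuming the same reppy Robots
# object: agent() returns the merged rules for one user agent, including
# its Crawl-delay if robots.txt sets one.
from reppy.robots import Robots

robots = Robots.fetch(Robots.robots_url('https://science.rpi.edu/'))
agent = robots.agent('my-crawler')
print(agent.allowed('https://science.rpi.edu/computer-science/'))
print(agent.delay)  # None when robots.txt sets no Crawl-delay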
def parse_webpages(webpages):
    for page in webpages:
        # Obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if robots.allowed(page, '*'):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)

            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]

            docs = []  # the documents on the page
            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file
                # Can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(
                            requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(
                            requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith('pdf'):
                    # Special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # Must first check if the pdf is found
                            response = urlopen(link)
                    except urllib.error.HTTPError:
                        # If 404 error, store "404" as the text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)
                    else:
                        # Otherwise save the pdf so it can be parsed
                        file = open(pdf, 'wb')
                        file.write(response.read())
                        file.close()
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        file = open(pdf, 'rb')
                        parser = PDFParser(file)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # The interpreter processes each page stored in the
                            # PDFDocument object
                            interpreter.process_page(p)
                            # The device renders the layout from the interpreter
                            layout = device.get_result()
                            # Of the many LT objects within the layout, we are
                            # interested in LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(
                                        lt_obj, LTTextLine):
                                    txt += lt_obj.get_text()
                        # Close the pdf file
                        file.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # Remove the saved file when done
                        docs.append(name)

            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }
            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)
    return output
def can_crawl(self, url: str) -> bool:
    robots_url = Robots.robots_url(url)
    robots = Robots.fetch(robots_url, headers={'user-agent': 'slurper'})
    return robots.allowed(url, 'slurper')
def __init__(self, seed_url, user_agent):
    self.seed_url = seed_url
    self.user_agent = user_agent
    self.robots_url = Robots.robots_url(seed_url)
    self.robots = Robots.fetch(self.robots_url)
    self.accepted_header_content_type = "text/html"
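# A minimal usage sketch for the constructor above; the class name
# SeedCrawler is hypothetical (the snippet does not show it), and the
# agent()/delay calls assume reppy's documented Agent API.
crawler = SeedCrawler('https://example.com/', 'my-crawler')

# Check a candidate URL against the robots.txt fetched at construction time
if crawler.robots.allowed('https://example.com/page', crawler.user_agent):
    # Honor Crawl-delay if robots.txt sets one, else default to one second
    delay = crawler.robots.agent(crawler.user_agent).delay or 1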