def manual_add_robot_policies():
    # because some critical sites have an invalid robots.txt
    # (surprised to see SO MANY sites without a valid robots.txt!)
    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n'
                  'Disallow: /search\n'
                  'Disallow: /advanced_search\n')
    robots_policies['findingaids.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n'
                  'Disallow: /catalog\n'
                  'Disallow: /contact\n'
                  'Disallow: /downloads\n'
                  'Disallow: /users\n')
    robots_policies['digitalhub.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n'
                  'Disallow: /catalog\n')
    robots_policies['images.library.northwestern.edu'] = site_rp
    robots_policies['images.northwestern.edu'] = site_rp
    robots_policies['media.northwestern.edu'] = site_rp
    robots_policies['arch.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n'
                  'Disallow: /?*\n')
    robots_policies['schedule.radiology.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    try:
        request = urllib2.Request('http://www.ctd.northwestern.edu/robots.txt')
        response = urllib2.urlopen(request, timeout=5)
        content = response.read()
    except Exception:
        content = 'User-agent: * \n'
        content += 'Disallow: /courses?*\n'
    site_rp.parse(content)
    robots_policies['www.ctd.northwestern.edu'] = site_rp
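# Usage sketch (not part of the original source): robots_policies is assumed to
# be a module-level dict mapping hostname -> RobotExclusionRulesParser. A crawler
# would consult it before fetching, roughly like this:
robots_policies = {}
manual_add_robot_policies()
policy = robots_policies['findingaids.library.northwestern.edu']
assert not policy.is_allowed('*', '/search')       # disallowed by the rules above
assert policy.is_allowed('*', '/collections/123')  # no matching rule, so allowed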
def __init__(self, info, instance, parent=None):
    super(CrawlerWorker, self).__init__(parent)
    self._instance = instance
    self.running = True
    self.base_url = info['base_url']    # main url of website
    self._links_to_crawl = []           # list of links yet to open
    self.crawled_links = {}             # dictionary of links opened/all links
    self.__parsed_crawled = {}          # list of urls and their html pages
    self.total = 0                      # total number of found links
    self.total_crawled = 0              # total number of valid crawled links in website
    self.max_pages = info['max_crawl']  # max pages to crawl
    self.invalid_links_count = 0        # number of broken links found
    self.invalid_links_list = []        # list of broken links found
    self.dynamic = []
    self.info = info
    self.login_url = info['login_url']  # login page url if available
    if info['robo_url']:
        self._rb_parser = RobotExclusionRulesParser()
        self._rb_parser.fetch(info['robo_url'])
        self._user_agent = 'WASecBot'
    else:
        self._rb_parser = None
    self.browser = browser.RoboBrowser(parser="html.parser", user_agent="WASecBot")
    self.browser.session.verify = False
    self._logged_in = False
    self._instance.btncrawlcancel.clicked.connect(self.pause)
    self._elapsed = 0
    self.delay = 15
    self._requests = 0
    self.start = None
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
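# Hedged sketch (an assumed companion method, not in the original): with the
# fields initialized above, a per-request robots check inside CrawlerWorker
# would look roughly like this.
def _robots_allows(self, url):
    # No robo_url configured means no parser was built; allow the fetch then.
    if self._rb_parser is None:
        return True
    return self._rb_parser.is_allowed(self._user_agent, url)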
def __get_robot_handler(url):
    rp = RobotExclusionRulesParser()
    if Util.is_url(url):
        # get the original base url and fetch its robots.txt
        base_url = Util.get_base_url(url)
        rp.fetch(urljoin(base_url, 'robots.txt'))
    return rp
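# Minimal usage sketch (Util.is_url and Util.get_base_url are project helpers,
# not library APIs; their behavior is inferred from their names):
#
#     rp = __get_robot_handler('https://example.com/some/page')
#     if rp.is_allowed('*', 'https://example.com/some/page'):
#         ...  # safe to fetch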
def __init__(self, robots_doc):
    ''' Initialize from database document representation. '''
    self._updated_at = robots_doc['updated_at']
    self._robots = RobotExclusionRulesParser()
    if robots_doc['file'] is not None:
        try:
            self._robots.parse(robots_doc['file'])
        except Exception:
            # Ignore a malformed stored robots.txt file.
            pass
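# Example document shape (inferred from the constructor above; only these two
# fields are read, anything else in the document is unknown):
#
#     robots_doc = {
#         'updated_at': '2017-01-01T00:00:00Z',
#         'file': 'User-agent: *\nDisallow: /tmp\n',
#     }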
def __init__(self, content=None, expires=None):
    super(RerpWrapper, self).__init__(content, expires)
    if content:
        self.parser = RobotExclusionRulesParser()
        self.parser.use_local_time = False
        self.parser.expiration_date = self.expires
        self.parser.parse(content)
    else:
        self.parser = None
    self.my_super = super(RerpWrapper, self)
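# Hypothetical companion method (not in the original): a wrapper like this
# typically delegates to the wrapped parser when content was supplied and falls
# back to the parent implementation otherwise. The method name `allowed` and
# the parent's signature are assumptions.
def allowed(self, url, agent):
    if self.parser is not None:
        return self.parser.is_allowed(agent, url)
    return self.my_super.allowed(url, agent)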
def crawl_website(website):
    website.update_robots_txt()  # only updates if necessary
    rules = RobotExclusionRulesParser()
    rules.parse(website.robots_content)
    # TODO add check for site last updated timestamp
    # Has the index been retrieved yet?
    if not website.webpage_set.exists():
        # get index
        if rules.is_allowed('*', '/'):
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=True,
                website=website,
            )
            crawl_existing_webpage(webpage, rules)
        else:
            # create a placeholder index webpage
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=False,
                website=website,
            )
            print 'Robots not allowed to index root'
            return None
    # Are there webpages to be accessed?
    allowed_webpages = website.webpage_set.filter(robots_allowed=True)
    if not allowed_webpages.exists():
        # print 'no allowed webpages found for {website}'.format(website=website.url)
        return None
    # Are there new links to try out?
    new_webpages = allowed_webpages.filter(exists=None)
    if new_webpages.exists():
        # start with the oldest first
        # created and updated are the same for newly-created webpages
        webpage = new_webpages.order_by('created').first()
        print 'crawling new'
        return crawl_existing_webpage(webpage, rules)
    # Crawl an existing webpage
    if rules.is_allowed('*', '/foo.html'):
        webpage = allowed_webpages.filter(
            exists=True).order_by('updated').first()
        print 'crawling existing'
        return crawl_existing_webpage(webpage, rules)
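# Usage sketch (assumptions: this is Django, as Webpage.objects implies, and
# the parent model is named Website with the update_robots_txt() method and
# robots_content field used above):
#
#     for website in Website.objects.all():
#         crawl_website(website)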
def allowed_url(self):
    # FIXME: Should use the geturl address as it may have been redirected
    scheme, netloc, path, query, fragment = urlsplit(self.url)
    robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])
    # FIXME: Should cache robots.txt in a better persistent data structure
    if robot_url in ROBOT_CACHE:
        rp = ROBOT_CACHE[robot_url]
    else:
        rp = RobotExclusionRulesParser()
        try:
            rp.fetch(robot_url)
        # Currently if there's a problem we assume there is no robots.txt
        except IOError:
            # Should be catching the urllib2.URLError exception
            logging.debug("Couldn't retrieve robots.txt for %s" % robot_url)
            rp = None
        except UnicodeDecodeError:
            logging.debug("Unicode decode error for robots.txt at %s" % robot_url)
            rp = None
        except httplib.HTTPException:
            logging.debug("Generic HTTPException for robots.txt at %s" % robot_url)
            rp = None
        ROBOT_CACHE[robot_url] = rp

    if rp is None or rp.is_allowed("*", self.url):
        base_url = urlunsplit([scheme, netloc, "", "", ""])
        # If there's a current delay on the site respect robots.txt and stall
        if self.db.exists(netloc):
            logging.debug("Obeying robot overlord for %s..." % netloc)
            URLHandler.add_to_busy(self.db, self.url)
            return False
        # Set a delay for any other requests to this site to respect robots.txt
        delay = rp.get_crawl_delay("*") if rp else None
        if delay:
            delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
        else:
            delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
        self.db.setex(netloc, "1", delay)
        return True
    else:
        return False
def __init__(self, robotstxt_body, spider):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    self.spider = spider
    self.rp = RobotExclusionRulesParser()
    try:
        robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning("Failure while parsing robots.txt using %(parser)s."
                       " File either contains garbage or is in an encoding other than UTF-8,"
                       " treating it as an empty file.",
                       {'parser': "RobotExclusionRulesParser"},
                       exc_info=sys.exc_info(),
                       extra={'spider': self.spider})
        robotstxt_body = ''
    self.rp.parse(robotstxt_body)
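# Sanity check (a sketch, not from the original): RobotExclusionRulesParser
# treats an empty file as "allow everything", which is what makes the
# UnicodeDecodeError fallback above an allow-all state.
from robotexclusionrulesparser import RobotExclusionRulesParser
rp = RobotExclusionRulesParser()
rp.parse('')
assert rp.is_allowed('anybot', 'http://example.com/anything')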
def check(self, hostkey, relurl):
    """
    Return True if allowed to fetch, False if not, None if we
    do not have robots.txt for this entry.
    """
    robotstxt, expiration = self.robots.get(hostkey, (None, None))
    if robotstxt is None:
        return None
    robotparser = RobotExclusionRulesParser()
    robotparser.parse(robotstxt)
    # FIXME: mtime? we need to let robots.txt expire.
    # (is_expired is a property on RobotExclusionRulesParser, and a freshly
    # parsed body is effectively never expired, so this is a stopgap.)
    if robotparser.is_expired:
        return None
    return robotparser.is_allowed('*', hostkey + relurl)
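# Usage sketch (assumption: self.robots maps a host key such as
# 'http://example.com' to a (robots.txt body, expiration) tuple, as the .get()
# above implies; the expiration element is not yet consulted, per the FIXME):
#
#     checker.robots['http://example.com'] = (
#         'User-agent: *\nDisallow: /private\n', None)
#     checker.check('http://example.com', '/private')   # -> False
#     checker.check('http://example.com', '/public')    # -> True
#     checker.check('http://unknown.com', '/')          # -> None (no robots.txt)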
def __init__(self, robotstxt_body, spider):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    self.spider = spider
    self.rp = RobotExclusionRulesParser()
    robotstxt_body = decode_robotstxt(robotstxt_body, spider)
    self.rp.parse(robotstxt_body)
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
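# Example invocation (hypothetical data; the expected dict shape is inferred
# from the benchmark body: a robots.txt string plus links to test):
website = {
    'robotstxt': 'User-agent: *\nDisallow: /private\n',
    'links': ['http://example.com/', 'http://example.com/private/page'],
}
benchmark_rerp_parser(website)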
def __init__(self, url):
    self.url = Url(urljoin(url, '/robots.txt'))
    self.rerp = RobotExclusionRulesParser()
    self.rerp.user_agent = 'Mozilla/5.0'
    self.rerp.fetch(self.url.url())
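# Follow-up sketch (not in the original): once fetched, the same parser can
# answer per-URL queries and report the site's Crawl-delay for the configured
# user agent (get_crawl_delay returns None when the directive is absent):
#
#     if self.rerp.is_allowed(self.rerp.user_agent, page_url):
#         delay = self.rerp.get_crawl_delay(self.rerp.user_agent)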
class Crawler:
    start_page_url = ''
    rerp = RobotExclusionRulesParser()
    cFiles = CrawlerFiles()
    tld = ''
    waiting_url_set = set()
    crawled_url_set = set()
    bad_url_set = set()
    find_string_set = set()
    find_flname_set = set()
    found_flname_set = set()
    found_string_set = set()
    stop_request = False
    download_chunk_size = 0
    conn_timeout = 0
    delay = 0
    user_agent = ''
    bad_url_prefix = '->Bad Url '
    found_string_prefix = '->Found '
    found_flname_prefix = '->Saved '

    def __init__(self, save_dir, start_url, find_flname_set, find_string_set,
                 chunk_size, conn_timeout, default_delay, user_agent):
        logger.info('->Starting RERP')
        Crawler.rerp.fetch(start_url + '/robots.txt')
        Crawler.user_agent = user_agent
        delay = Crawler.rerp.get_crawl_delay(Crawler.user_agent)
        Crawler.conn_timeout = conn_timeout
        if delay is None:
            Crawler.delay = default_delay
        else:
            Crawler.delay = delay
        Crawler.cFiles = CrawlerFiles(save_dir, start_url)
        logger.info('->Getting Previous Session files (if any)')
        Crawler.crawled_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.crawled_file)
        Crawler.found_flname_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.found_files_file)
        Crawler.found_string_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.found_strings_file)
        Crawler.bad_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.invalid_file)
        Crawler.waiting_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.waiting_file)
        info = Crawler.cFiles.get_file_data(Crawler.cFiles.info_file)
        Crawler.start_page_url = start_url
        Crawler.tld = url_func.return_tld(start_url)
        Crawler.find_string_set = find_string_set
        Crawler.find_flname_set = find_flname_set
        Crawler.download_chunk_size = chunk_size
        logger.info('Crawler Initiated')
        logger.info('->Loading Website Info')
        logger.debug('* ' * 20 + 'Website Info' + '* ' * 20)
        if info is None:
            info = url_func.get_domain_info(Crawler.tld)
            Crawler.cFiles.set_file_data(Crawler.cFiles.info_file, info)
        for key in info:
            val = info[key]
            if val:
                logger.debug("%-20s : %s" % (str(key).upper(), str(val)))
        logger.debug('* ' * 40)

    @staticmethod
    def crawl_page(t_name, page_url):
        # noinspection PyBroadException
        try:
            logger.debug("%s - %s" % (t_name, page_url))
            if not Crawler.rerp.is_allowed(Crawler.user_agent, page_url):
                logger.debug('->%s not allowed to crawl %s' % (t_name, page_url))
                return
            Crawler.add_urls(page_url)
            if not Crawler.stop_request:
                Crawler.waiting_url_set.remove(page_url)
                Crawler.crawled_url_set.add(page_url)
                time.sleep(Crawler.delay)
        except requests.HTTPError as h:
            string = "HTTP Error %d - %s" % (h.response.status_code, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.ReadTimeout:
            string = "Timeout %0.1f secs - %s " % (Crawler.conn_timeout, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.TooManyRedirects as t:
            string = "%s - %s" % (t, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except (requests.ConnectionError, requests.ConnectTimeout):
            if url_func.check_connection() != url_func.CONNECTION_OK:
                Crawler.wait(t_name)
        except Exception:
            logger.exception('Exception in %s ' % page_url)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)

    @staticmethod
    def add_urls(page_url):
        not_html = False
        with closing(
                requests.get(page_url, stream=True,
                             timeout=Crawler.conn_timeout)) as page:  # html code of page
            # get content type from header of html page
            type_of_page = page.headers['Content-Type']
            page.raise_for_status()
            if 'html' in type_of_page:  # web page
                soup = BeautifulSoup(page.content, "html.parser")  # parse the content of page
                text = soup.text
                for string in Crawler.find_string_set:
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    str_url = string + ' ' + page_url
                    if text is not None and string in text:
                        Crawler.found_string_set.add(str_url)
                        logger.debug('%s %s %s' % (Crawler.found_string_prefix,
                                                   string, page_url))
                for a_tag_content in soup.find_all('a'):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    url = parse.urljoin(Crawler.start_page_url,
                                        a_tag_content.get('href'))
                    if '#' in url:
                        url = url.split('#')[0]
                    if ' ' in url:
                        url = url.replace(' ', '%20')
                    if url_func.return_tld(url) == Crawler.tld:
                        if url not in Crawler.crawled_url_set:
                            Crawler.waiting_url_set.add(url)
            else:
                not_html = True
        if not_html:
            f_name = page_url.split('/')[-1]
            download_file = False
            for string in Crawler.find_flname_set:
                if Crawler.stop_request:
                    break
                if string in f_name:
                    download_file = True
                    break
            if download_file:
                type_split = type_of_page.split('/')
                f_dir = Crawler.cFiles.save_dir + '/' + type_split[0]
                if not dir_exists(f_dir):
                    make_dir(f_dir)
                Crawler.found_flname_set.add(page_url)
                Crawler.file_download(page_url, f_dir, f_name)
                if not Crawler.stop_request:
                    logger.debug('%s %s' % (Crawler.found_flname_prefix, page_url))

    # wait
    @staticmethod
    def wait(t_name):
        logger.info('->%s waiting for connection...' % t_name)
        while True:
            if Crawler.stop_request:
                break
            if url_func.check_connection() == url_func.CONNECTION_OK:
                break
            time.sleep(2)

    @staticmethod
    def update_files():
        logger.info('Updating Files')
        Crawler.cFiles.set_file_data(Crawler.cFiles.crawled_file,
                                     Crawler.crawled_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_files_file,
                                     Crawler.found_flname_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_strings_file,
                                     Crawler.found_string_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.invalid_file,
                                     Crawler.bad_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.waiting_file,
                                     Crawler.waiting_url_set)

    @staticmethod
    def file_download(file_url, f_dir, f_name):
        f_path = get_file_path(f_dir, f_name)
        # logger.info('Saving ', f_name)
        dl = file_size(f_path)
        resume_header = {'Range': 'bytes=%d-' % dl}
        with closing(
                requests.get(file_url, stream=True, headers=resume_header,
                             timeout=Crawler.conn_timeout)) as file:
            tl_str = file.headers.get('content-length')
            # if there is no content length specified in the header,
            # the website doesn't support resuming
            mode = 'ab' if tl_str else 'wb'
            with open(f_path, mode) as handle:
                for chunk in file.iter_content(
                        chunk_size=Crawler.download_chunk_size):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    if chunk:
                        handle.write(chunk)
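# Usage sketch (hypothetical arguments; the class-level attributes above mean
# state is shared, i.e. one crawl per process):
#
#     crawler = Crawler(save_dir='out', start_url='http://example.com',
#                       find_flname_set={'report'}, find_string_set={'TODO'},
#                       chunk_size=8192, conn_timeout=10, default_delay=1,
#                       user_agent='MyBot')
#     Crawler.crawl_page('worker-1', 'http://example.com/')
#     Crawler.update_files()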
def _parse_robots(self, response):
    rp = RobotExclusionRulesParser()
    rp.parse(response.body)
    self._parsers[urlparse_cached(response).netloc] = rp
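# Context sketch (assumption: this lives in a Scrapy-style robots.txt
# middleware, as urlparse_cached suggests). The stored parser would then gate
# requests per netloc; self._useragent is an assumed attribute.
#
#     def _allowed(self, request):
#         rp = self._parsers.get(urlparse_cached(request).netloc)
#         return rp is None or rp.is_allowed(self._useragent, request.url)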
def crawl(self, in_url):
    global global_id, last_update, DOMAIN
    print("Crawler %d on P#%d: %s" % (self.id, url_ids[in_url], in_url))
    try:
        request = urllib2.Request(in_url)
        response = urllib2.urlopen(request, timeout=5)
        real_url = w3lib.url.canonicalize_url(response.geturl())
        real_uri = urlparse(real_url)
        extension = real_uri.path.lower().split('.')[-1]
        if response.info().maintype != 'text' or extension in skip_file_types:
            content = ''
        else:
            content = response.read()
    except Exception:
        real_url = in_url
        content = ''
    if real_url == in_url:  # no redirect
        soup = BeautifulSoup(content, "html.parser")
        raw_urls = [link.get('href') for link in soup.find_all('a')]
    else:  # redirect
        raw_urls = [real_url]
    out_urls = set()
    for url in raw_urls:
        # print('parsing', url)
        if url is None or len(url) <= 1:
            continue
        url = url.strip()
        if url.startswith('/http://') or url.startswith('/https://'):
            # why would someone do this?
            url = url[1:]
        if url.startswith('mailto:') or url.startswith('mailto@'):
            continue
        fixed_url = w3lib.url.canonicalize_url(urljoin(in_url, url))
        if len(fixed_url) > 1000:  # long urls tend to be wrong urls
            continue
        uri = urlparse(fixed_url)
        if uri.scheme is not None and uri.scheme not in ['http', 'https', '']:
            continue
        if uri.hostname is not None:
            if not uri.hostname.endswith(DOMAIN):
                continue
            elif uri.hostname not in robots_policies:
                site_rp = RobotExclusionRulesParser()
                try:
                    site_rp.fetch('http://' + uri.hostname + '/robots.txt',
                                  timeout=3)
                except Exception:
                    print "error with", ('http://' + uri.hostname + '/robots.txt')
                rp_lock.acquire()
                robots_policies[uri.hostname] = site_rp
                rp_lock.release()
            if not robots_policies[uri.hostname].is_allowed("*", fixed_url):
                continue
        extension = uri.path.lower().split('.')[-1]
        if extension in skip_file_types:
            continue
        if 1 < len(extension) < 8 and '/' not in extension:
            urls_extensions.add(extension)
        out_urls.add(fixed_url)
    # print out_urls
    # get lock
    write_lock.acquire()
    out_ids = []
    for url in out_urls:
        if url in url_ids:
            out_ids.append(url_ids[url])
        else:
            url_ids[url] = global_id
            out_ids.append(global_id)
            url_id_file.write('%d\t%s\n' % (global_id, url))
            url_id_file.flush()
            global_id += 1
            url_tasks.put(url)
    transition_file.write('%d\t%s\n' % (url_ids[in_url], str(out_ids)))
    transition_file.flush()
    last_update = time.time()
    write_lock.release()  # release lock
    print('%d urls in total reported by %d' % (global_id, self.id))