class RerpRobotParser(RobotParser):

    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        try:
            robotstxt_body = robotstxt_body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
            # Switch to 'allow all' state.
            logger.warning("Failure while parsing robots.txt using %(parser)s."
                           " File either contains garbage or is in an encoding other than UTF-8,"
                           " treating it as an empty file.",
                           {'parser': "RobotExclusionRulesParser"},
                           exc_info=sys.exc_info(),
                           extra={'spider': self.spider})
            robotstxt_body = ''
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
class RobotsTxt:
    '''
    Wrapper around robots.txt parser that adds the date the file was fetched.

    If the ``robots_file`` is None or cannot be parsed, then it's treated as a
    highly permissive robots.txt.
    '''

    def __init__(self, robots_doc):
        ''' Initialize from database document representation. '''
        self._updated_at = robots_doc['updated_at']
        self._robots = RobotExclusionRulesParser()
        if robots_doc['file'] is not None:
            try:
                self._robots.parse(robots_doc['file'])
            except:
                pass

    def is_allowed(self, user_agent, url):
        ''' Return True if ``url`` is allowed by this robots.txt file. '''
        return self._robots.is_allowed(user_agent, url)

    def is_older_than(self, age):
        ''' Return True if this robots file is older than ``age`` (in seconds). '''
        # total_seconds() (not .seconds) so ages longer than a day compare correctly
        return (datetime.now(tzlocal()) - self._updated_at).total_seconds() > age
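A minimal usage sketch for the RobotsTxt wrapper above; the document shape (the 'updated_at' and 'file' keys) is inferred from __init__, and the example values, user agent, and age threshold are hypothetical rather than taken from the original project.

# Hypothetical usage of the RobotsTxt wrapper (document keys inferred from __init__)
doc = {
    'updated_at': datetime.now(tzlocal()),
    'file': "User-agent: *\nDisallow: /private/\n",
}
robots = RobotsTxt(doc)
if robots.is_older_than(24 * 3600):
    pass  # stale: refetch robots.txt before trusting it
elif robots.is_allowed('MyBot', 'https://example.com/private/page'):
    pass  # allowed: go ahead and fetch the page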
class RerpWrapper(python_common.web.robots_txt.parser_base.RobotsTxtParser):

    def __init__(self, content=None, expires=None):
        super(RerpWrapper, self).__init__(content, expires)
        if content:
            self.parser = RobotExclusionRulesParser()
            self.parser.use_local_time = False
            self.parser.expiration_date = self.expires
            self.parser.parse(content)
        else:
            self.parser = None
        self.my_super = super(RerpWrapper, self)

    def allowed(self, user_agent, url):
        return self.parser.is_allowed(user_agent, url) if self.parser else self.my_super.allowed(user_agent, url)

    def delay(self, user_agent):
        return self.parser.get_crawl_delay(user_agent) if self.parser else self.my_super.delay(user_agent)

    @property
    def expired(self):
        return self.parser.is_expired if self.parser else self.my_super.expired

    @property
    def sitemaps(self):
        return self.parser.sitemaps if self.parser else self.my_super.sitemaps
def __get_robot_handler(url):
    rp = RobotExclusionRulesParser()
    if Util.is_url(url):
        # get the original base url
        base_url = Util.get_base_url(url)
        page = requests.get(urljoin(base_url, 'robots.txt'))
        rp.fetch(urljoin(base_url, 'robots.txt'))
    return rp
def __init__(self, url, robots_fetch_timeout, user_agent, logger):
    self._logger = logger
    split_url = urlparse(url)
    split_list = list(split_url)
    split_list[2] = ROBOTS_FILE  # the path component is at index 2 of the urlparse tuple
    robots_txt_url = str(urlunparse(tuple(split_list)))
    robots_filter = RobotExclusionRulesParser()
    logger.debug("Fetching robots filter from path: %s" % robots_txt_url)
    robots_filter.fetch(robots_txt_url, robots_fetch_timeout)
    self._robots_filter = robots_filter
    self._ua = user_agent
class Robot:

    def __init__(self, url):
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        return not self.rerp.is_allowed('Mozilla/5.0', url.url())
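A short, hedged usage sketch for the Robot wrapper above; it assumes the same Url helper class from that snippet is importable, and the example URLs are placeholders.

import time

# Hypothetical usage of the Robot wrapper (Url comes from the same project as the snippet above)
robot = Robot('https://example.com/some/page')
if not robot.should_block(Url('https://example.com/docs/intro.html')):
    delay = robot.throttle_time()  # None if robots.txt sets no Crawl-delay
    time.sleep(delay or 1.0)
    # ... fetch the page ...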
def load_robot_rules():
    """ load rules from the robots.txt

    if the online version is not accessible, then the local version is
    loaded from disk
    """
    rerp = RobotExclusionRulesParser()
    try:
        rerp.fetch(urlparse.urljoin(BASE_URL, '/robots.txt'))
    except:
        rerp.parse(open('robots.txt', 'r').read())
    return rerp
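A hedged usage sketch for load_robot_rules() above; BASE_URL and the wildcard agent are placeholders matching that snippet's Python 2 style, not the project's actual call site.

# Hypothetical call site for load_robot_rules()
rules = load_robot_rules()
if rules.is_allowed('*', urlparse.urljoin(BASE_URL, '/some/page.html')):
    pass  # safe to request the page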
def is_url_allowed(url):
    """ Returns ``True`` if robots.txt rules for given URL allow fetching it.

    This function parses the robots rules for given URL (if any) and returns a
    boolean flag that tells you whether fetching it is allowed. Note that it
    doesn't test whether the URL exists on the host.

    :param url:     URL to test
    :returns:       ``True`` if URL can be fetched, ``False`` otherwise
    """
    robots = RobotParser()
    robots.user_agent = UA_STRING
    robots.fetch(get_robots_url(url))
    if robots.response_code != 200:
        return True
    return robots.is_allowed(UA_STRING, url)
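The snippet above relies on a get_robots_url() helper that is not shown; the following is a plausible sketch of such a helper (an assumption for illustration, not the project's actual code).

from urllib.parse import urlsplit, urlunsplit

def get_robots_url(url):
    # Hypothetical helper: build the site-root /robots.txt URL for any page URL
    scheme, netloc, _, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc, '/robots.txt', '', ''))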
class RerpRobotParser(RobotParser):

    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
def get_robots(url):
    robots_directory = 'robots'
    robots_file_path = robots_directory + '/' + url
    if os.path.isfile(robots_file_path):
        robots_file = open(robots_file_path, "rb")
        # robots_parser = RobotExclusionRulesParser()
        # robots_parser.parse(content)
        robots_parser = pickle.load(robots_file)
    else:
        buffer = StringIO.StringIO()
        c = pycurl.Curl()
        c.setopt(c.URL, 'http://' + url + '/robots.txt')
        c.setopt(c.REFERER, '')
        c.setopt(c.USERAGENT, 'Curl')
        c.setopt(c.FOLLOWLOCATION, 1)
        c.setopt(c.WRITEFUNCTION, buffer.write)
        try:
            c.perform()
        except pycurl.error, e:
            print "Error code: ", e[0]
            print "Error message: ", e[1]
            c.close()
            # on failure, fall back to an empty (allow-all) rule set
            robots_parser = RobotExclusionRulesParser()
            robots_parser.parse('')
            return robots_parser
        c.close()
        # print buffer.getvalue()
        robots_parser = RobotExclusionRulesParser()
        robots_parser.parse(buffer.getvalue())
        robots_file = open(robots_file_path, "wb")
        pickle.dump(robots_parser, robots_file)
    # return the cached or freshly fetched parser
    return robots_parser
def allowed_url(self):
    # FIXME: Should use the geturl address as it may have been redirected
    scheme, netloc, path, query, fragment = urlsplit(self.url)
    robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])
    # FIXME: Should cache robots.txt in a better persistent data structure
    if robot_url in ROBOT_CACHE:
        rp = ROBOT_CACHE[robot_url]
    else:
        rp = RobotExclusionRulesParser()
        try:
            rp.fetch(robot_url)
        # Currently if there's a problem we assume there is no robots.txt
        except IOError:
            # Should be catching the urllib2.URLError exception
            logging.debug("Couldn't retrieve robots.txt for %s" % robot_url)
            rp = None
        except UnicodeDecodeError:
            logging.debug("Unicode decode error for robots.txt at %s" % robot_url)
            rp = None
        except httplib.HTTPException:
            logging.debug("Generic HTTPException for robots.txt at %s" % robot_url)
            rp = None
        ROBOT_CACHE[robot_url] = rp

    if rp is None or rp.is_allowed("*", self.url):
        base_url = urlunsplit([scheme, netloc, "", "", ""])
        # If there's a current delay on the site respect robots.txt and stall
        if self.db.exists(netloc):
            logging.debug("Obeying robot overlord for %s..." % netloc)
            URLHandler.add_to_busy(self.db, self.url)
            return False
        # Set a delay for any other requests to this site to respect robots.txt
        delay = rp.get_crawl_delay("*") if rp else None
        if delay:
            delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
        else:
            delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
        self.db.setex(netloc, "1", delay)
        return True
    else:
        return False
def crawl_website(website):
    website.update_robots_txt()  # only updates if necessary
    rules = RobotExclusionRulesParser()
    rules.parse(website.robots_content)
    # TODO add check for site last updated timestamp

    # Has the index been retrieved yet?
    if not website.webpage_set.exists():
        # get index
        if rules.is_allowed('*', '/'):
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=True,
                website=website,
            )
            crawl_existing_webpage(webpage, rules)
        else:
            # create a placeholder index webpage
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=False,
                website=website,
            )
            print 'Robots not allowed to index root'
            return None

    # Are there webpages to be accessed?
    allowed_webpages = website.webpage_set.filter(robots_allowed=True)
    if not allowed_webpages.exists():
        # print 'no allowed webpages found for {website}'.format(website=website.url)
        return None

    # Are there new links to try out?
    new_webpages = allowed_webpages.filter(exists=None)
    if new_webpages.exists():
        # start with the oldest first
        # created and updated are the same for newly-created webpages
        webpage = new_webpages.order_by('created').first()
        print 'crawling new'
        return crawl_existing_webpage(webpage, rules)

    # Crawl an existing webpage
    if rules.is_allowed('*', '/foo.html'):
        webpage = allowed_webpages.filter(exists=True).order_by('updated').first()
        print 'crawling existing'
        return crawl_existing_webpage(webpage, rules)
def check(self, hostkey, relurl):
    """
    Return True if allowed to fetch, False if not, None if we do not
    have robots.txt for this entry.
    """
    robotstxt, expiration = self.robots.get(hostkey, (None, None))
    if robotstxt is None:
        return None
    # FIXME: mtime? we need to let robots.txt expire (check `expiration` here).
    robotparser = RobotExclusionRulesParser()
    # RobotExclusionRulesParser takes the raw robots.txt text and has no
    # seturl()/can_fetch() like urllib's RobotFileParser, so parse the cached
    # text and use is_allowed(); '*' is a stand-in user agent since this
    # snippet does not carry one.
    robotparser.parse(robotstxt)
    return robotparser.is_allowed('*', hostkey + relurl)
def _parse_robots(self, response):
    rp = RobotExclusionRulesParser()
    rp.parse(response.body)
    self._parsers[urlparse_cached(response).netloc] = rp
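_parse_robots() above only fills the per-netloc cache; the following is a hedged sketch of how the cached parser would typically be consulted for a later request (the method name and default agent are assumptions, not part of the original middleware).

def _robot_allows(self, request, user_agent='*'):
    # Hypothetical companion to _parse_robots(): permissive when no parser is cached yet
    rp = self._parsers.get(urlparse_cached(request).netloc)
    return rp is None or rp.is_allowed(user_agent, request.url)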
#!/usr/bin/python
# encoding:utf-8
from robotexclusionrulesparser import RobotExclusionRulesParser as RobotsParser

rb = RobotsParser()
# rb.fetch("http://www.zhihu.com/robots.txt")
# print rb
# print rb._RobotExclusionRulesParser__rulesets
# print rb.is_allowed('*', 'http://www.zhihu.com/loginasdkj?encode=12')
# print rb.is_allowed('*', '/admin_inbox')
# print '======'

rb.fetch("http://www.iplaypython.com/robots.txt")
print rb
print '======'

rb.fetch("http://baidu.com/robots.txt")
print rb
print '======'

rb.fetch("http://jaysonhwang.com/robots.txt")
print rb
print '======'
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
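A minimal sketch of the input benchmark_rerp_parser() expects, inferred from the keys it reads ('robotstxt' and 'links'); the values below are made up for illustration.

# Hypothetical input shape and a direct call
website = {
    'robotstxt': "User-agent: *\nDisallow: /private/\nCrawl-delay: 2\n",
    'links': [
        'https://example.com/',
        'https://example.com/private/report.html',
    ],
}
benchmark_rerp_parser(website)  # parses once, then checks every link for 'googlebot'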
class CrawlerWorker(QObject):
    # spider that will get links of website
    # called to create instance of the class
    finish = False

    def __init__(self, info, instance, parent=None):
        super(CrawlerWorker, self).__init__(parent)
        self._instance = instance
        self.running = True
        self.base_url = info['base_url']  # main url of website
        self._links_to_crawl = []  # list of links yet to open
        self.crawled_links = {}  # dictionary of links opened/all links
        self.__parsed_crawled = {}  # list of urls and their html pages
        self.total = 0  # total number of found links
        self.total_crawled = 0  # total number of valid crawled links in website
        self.max_pages = info['max_crawl']  # max pages to crawl
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # list of broken links found
        self.dynamic = []
        self.info = info
        self.login_url = info['login_url']  # login page url if available
        if info['robo_url']:
            self._rb_parser = RobotExclusionRulesParser()
            self._rb_parser.fetch(info['robo_url'])
            self._user_agent = 'WASecBot'
        else:
            self._rb_parser = None
        self.browser = browser.RoboBrowser(parser="html.parser", user_agent="WASecBot")
        self.browser.session.verify = False
        self._logged_in = False
        self.running = True
        self._instance.btncrawlcancel.clicked.connect(self.pause)
        self._elapsed = 0
        self.delay = 15
        self._requests = 0
        self.start = None
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _opener(self, url):
        retry = 1
        while True:
            try:
                self.browser.open(url=url)
                break
            except exceptions.ConnectionError as ce:
                # sleep(self.delay * retry)
                if retry == 11:
                    return False
                else:
                    retry += 1
        return True

    def _compute_crawl_delay(self):
        self._requests += 1
        if self._requests <= 10:
            self._elapsed += self.browser.response.elapsed.total_seconds()
            delay = self._elapsed / self._requests
            self.delay = delay * 200
            if self.delay >= 180:
                self.delay = 15
        else:
            self._requests = 1
            self._elapsed = self.browser.response.elapsed.total_seconds()
            self.delay = self._elapsed * 200

    def pause(self):
        self.running = False
        self._instance.change_state.emit('Canceling...')
        choice = QtWidgets.QMessageBox.question(
            self._instance, "Cancel Crawl!",
            "WASec is not finished yet, are You sure you want to stop crawling?",
            QtWidgets.QMessageBox.Cancel | QtWidgets.QMessageBox.Yes)
        if choice == QtWidgets.QMessageBox.Yes:
            self.finish = True
            self.running = False
            self._instance.crawl_finished.emit(self._wrap_up())
        else:
            self.running = True

    # get total number of links opened so far
    def total_links(self):
        total = 0
        for index in self.crawled_links:
            total += len(self.crawled_links[index]['url'])
        return total

    # check if max pages reached
    def _crawled_max(self):
        result = (self.max_pages == 0) or (self.max_pages > self.total_links())
        return result

    # is link already listed
    def _is_link_listed(self, link):
        self._instance.change_state.emit('Check if URL is listed...')
        url = parse.urljoin(self.base_url, link)
        result = False
        for index in self.crawled_links:
            for opened in self.crawled_links[index]['url'].keys():
                if url == opened or link == opened:
                    result = True
        for to_open in self._links_to_crawl:
            if link == to_open[1] or url == to_open[1]:
                result = True
        return result

    # gets dynamic urls
    def _is_dynamic(self, url):
        self._instance.change_state.emit('Check if URL is dynamic...')
        if '?' in str(url) or '=' in str(url):
            self.dynamic.append(url)

    # check if page opened and exists
    def _is_response_ok(self, url):
        # status_code 200 means OK; no problems with page
        if 200 == self.browser.response.status_code:
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    def _is_html_page(self, url):
        try:
            if 'text/html' in self.browser.response.headers["content-type"]:
                return True
            else:
                self.invalid_links_count += 1
                self.invalid_links_list.append(url)
                self._instance.change_state.emit('URL is invalid!')
                return False
        except KeyError:
            return True

    def _is_same_page(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            page = self.browser.parsed
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    check = self.__parsed_crawled[link]
                    if check == page:
                        self._instance.change_state.emit('URL is invalid!')
                        return False
            return True
        else:
            self.finish = True
            self.running = False
            return False

    def _page_wise(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            return self._is_response_ok(url) and self._is_html_page(url) and self._is_same_page(url)
        else:
            self.finish = True
            self.running = False
            return False

    def _is_same_query(self, page_link):
        parsed_url = parse.urlparse(page_link)
        query = parse.parse_qsl(parsed_url.query)
        query_len = len(query)
        if query_len > 0:
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    parsed_link = parse.urlparse(link)
                    link_query = parse.parse_qsl(parsed_link.query)
                    if (parsed_link.path == parsed_url.path) and (len(link_query) == query_len):
                        i = n = 0
                        while i < query_len:
                            if query[i][0] == link_query[i][0]:
                                n += 1
                            i += 1
                        if n == query_len:
                            # result = self._is_same_page(page_link)
                            # return result
                            self._instance.change_state.emit('URL is invalid!')
                            print("is same query")
                            return False
        return True

    # check if given url belongs to website
    # i.e. is in the website's domain
    def _in_domain(self, url):
        if self.base_url in url:
            # result = 0 meaning url belongs to website
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check for url protocol
    def _check_protocol(self, url):
        parsed = parse.urlparse(url)  # parse url to get information from it
        protocol = str.lower(str(parsed[0]))  # get url protocol
        if protocol == "http" or protocol == "https":  # is protocol 'http' or 'https'
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            self._instance.change_state.emit('URL is invalid!')
            return False

    def _is_robot_allowed(self, path):
        if self._rb_parser:
            return self._rb_parser.is_allowed(self._user_agent, path)
        else:
            return True

    def _url_wise(self, url):
        return self._in_domain(url) and self._check_protocol(url) and self._is_same_query(url)

    def _is_url_good(self, url):
        return self._url_wise(url) and self._page_wise(url)

    def _at_login(self, url):
        if not self.login_url or self.login_url != str(url):
            return False
        elif self.login_url == str(url):
            return True

    def _check_login(self, parsed):
        if self.info['logged_in']:
            self._instance.change_state.emit('Logging into the website...')
            handel = BeyondLogin(self.browser)
            self._logged_in = handel.get_login_info(self.info)
            parent = self._check_parent(handel.login_url)
            if self._logged_in:
                self._instance.change_state.emit('Login Successful!')
                # sleep(2)
                if parent:
                    self._add_crawled(handel.login_url, parent, parsed)
                else:
                    self._add_crawled(handel.login_url, self.base_url, parsed)
            else:
                self._instance.change_state.emit('Login Failed!')
                self._links_to_crawl.append([handel.login_url, handel.redirect_url])
        else:
            self._instance.change_state.emit('Login Successful!')
            self._logged_in = True

    def _check_parent(self, url):
        for child in self._links_to_crawl:
            if child[1] == url:
                return child[0]
        return None

    # get all links in a given page
    def _get_page_links(self, url, page):
        self._instance.change_state.emit('Searching for all links in page...')
        # gets a list of all <a> tags in page
        links_tags = page.find_all("a")
        # going through each link
        for link in links_tags:
            self._instance.change_state.emit('Searching for all links in page...')
            link_href = link.get("href")  # get <a> tag link reference. example: <a href="page.html"> ==> page.html
            # check that: link isn't already listed + link isn't blank
            link_listed = self._is_link_listed(link_href)
            if (not link_listed) and ('#' not in str(link_href)):
                # add link to list of links to open
                self._links_to_crawl.append([url, link_href])
                print("_get_page_links")
                print(url, link_href)
                self.total += 1
        forms = page.find_all("form")
        for form in forms:
            action = form.get("action")
            if action:  # link isn't blank
                # check that: link isn't already listed
                link_listed = self._is_link_listed(action)
                if (not link_listed) and (action != "#"):
                    # add link to list of links to open
                    self._links_to_crawl.append([url, action])
                    self.total += 1
                    self._instance.show_total.emit(self.total)
        image_map = page.find_all('area')
        for area in image_map:
            href = area.get('href')  # get 'href' attribute from <area shape="rect" href="#main"> tag
            listed = self._is_link_listed(href)
            if (not listed) and ('#' not in href):
                # add link to list of links to open
                self._links_to_crawl.append([url, href])
                self.total += 1
                self._instance.show_total.emit(self.total)

    # open a page and get its content
    def _open_url(self, url):
        if self.running:
            self._instance.change_state.emit('Pausing between requests...')
            # get page content
            parsed = self.browser.parsed
            if self.info['max_crawl'] != 1:
                self._get_page_links(url, parsed)  # send content to retrieve links from
                # sleep(self.delay)
            else:
                self._add_crawled(url, url, parsed)
                self._is_dynamic(url)
                self._instance.show_total.emit(self.total_crawled)
            if self._at_login(url) and not self._logged_in:
                self._check_login(parsed)
            return parsed

    def _add_crawled(self, url, parent, parsed_page):
        self._instance.change_state.emit('Adding new crawled link...')
        found = False
        try:
            title = parsed_page.find('title')
            if not title:
                title = 'NO-TITLE'
            else:
                title = title.text
        except:
            title = 'NO-TITLE'
        for index in self.crawled_links:
            if self.crawled_links[index]['from'] == parent:
                self.crawled_links[index]['url'][url] = title
                found = True
                break
        if not found:
            self.crawled_links[self.total_crawled] = {
                'from': parent,
                'url': {
                    url: title
                }
            }
            self.total_crawled += 1
        self.__parsed_crawled[url] = parsed_page
        self._instance.on_info.emit(self.crawled_links)
        # sleep(2)

    # main spider function; creates our spider's web
    def run(self):
        self.start = datetime.now().time()
        self._opener(self.base_url)
        self._open_url(self.base_url)  # send main url to be opened and checked
        self._elapsed = self.browser.state.response.elapsed.total_seconds()
        self._compute_crawl_delay()
        # while there are still links to open
        self.i = len(self._links_to_crawl) - 1
        while (len(self._links_to_crawl)) > 0 and (self._crawled_max()) and not self.finish:
            self._instance.change_state.emit('Crawling...')
            # start from the last link in the list
            parent = self._links_to_crawl[self.i][0]
            link = self._links_to_crawl[self.i][1]
            print("----")
            print(parent, link)
            if parent[len(parent) - 1] != '/':
                parent = parent + '/'
            # url = parse.urljoin(self.base_url, link)  # join main url with page link
            url = parse.urljoin(parent, link)  # join main url with page link
            self._opener(url)
            if 200 != self.browser.response.status_code:
                url = parse.urljoin(self.base_url, link)  # join main url with page link
            print(url)
            if self._is_url_good(url) and self._is_robot_allowed(link):  # is url valid and working
                print("good")
                self._instance.change_state.emit('URL is good!')
                parsed_page = self._open_url(url)  # open page
                self._add_crawled(url, parent, parsed_page)
                self._compute_crawl_delay()
                # add link to list of opened links
                self._is_dynamic(url)
            else:
                print("not good")
                self._instance.change_state.emit('URL is not good!')
            # delete opened link from list of links to open
            self._links_to_crawl.pop(self.i)
            if self.i > 0:
                self.i = self.i - 1
            elif self.i == 0:
                self.i = len(self._links_to_crawl) - 1
            if len(self._links_to_crawl) == 0 or self.i < 0:
                self._instance.change_state.emit('Finished.')
                self.finish = True
                break
        self.finish = True
        self._instance.crawl_finished.emit(self._wrap_up())

    def _calc_time(self):
        finish = datetime.now().time()
        delta1 = timedelta(seconds=self.start.second,
                           microseconds=self.start.microsecond,
                           minutes=self.start.minute,
                           hours=self.start.hour)
        delta2 = timedelta(seconds=finish.second,
                           microseconds=finish.microsecond,
                           minutes=finish.minute,
                           hours=finish.hour)
        taken = delta2 - delta1
        seconds = round(taken.total_seconds())
        if seconds >= 3600:
            hours = round(seconds / 3600)
            minutes = (round((seconds / 3600) / 60))
            elapsed = str(hours) + ':' + str(minutes) + ' hrs'
        elif seconds >= 60:
            minutes = round(seconds / 60)
            seconds = round(seconds % 60)
            elapsed = str(str(minutes) + '.' + str(seconds) + ' mins')
        else:
            elapsed = str(seconds) + ' secs'
        return elapsed

    def _wrap_up(self):
        wrap = {
            'links': self.crawled_links,
            'dynamic': self.dynamic,
            'total_crawled': self.total_links(),
            'total': self.total,
            'invalid': self.invalid_links_count,
            'running': self.running,
            'time': self._calc_time()
        }
        return wrap
class Crawler:
    start_page_url = ''
    rerp = RobotExclusionRulesParser()
    cFiles = CrawlerFiles()
    tld = ''
    waiting_url_set = set()
    crawled_url_set = set()
    bad_url_set = set()
    find_string_set = set()
    find_flname_set = set()
    found_flname_set = set()
    found_string_set = set()
    stop_request = False
    download_chunk_size = 0
    conn_timeout = 0
    delay = 0
    user_agent = ''
    bad_url_prefix = '->Bad Url '
    found_string_prefix = '->Found '
    found_flname_prefix = '->Saved '

    def __init__(self, save_dir, start_url, find_flname_set, find_string_set,
                 chunk_size, conn_timeout, default_delay, user_agent):
        logger.info('->Starting RERP')
        Crawler.rerp.fetch(start_url + '/robots.txt')
        Crawler.user_agent = user_agent
        delay = Crawler.rerp.get_crawl_delay(Crawler.user_agent)
        Crawler.conn_timeout = conn_timeout
        if delay is None:
            Crawler.delay = default_delay
        else:
            Crawler.delay = delay
        Crawler.cFiles = CrawlerFiles(save_dir, start_url)
        logger.info('->Getting Previous Session files (if any) ')
        Crawler.crawled_url_set = Crawler.cFiles.get_file_data(Crawler.cFiles.crawled_file)
        Crawler.found_flname_set = Crawler.cFiles.get_file_data(Crawler.cFiles.found_files_file)
        Crawler.found_string_set = Crawler.cFiles.get_file_data(Crawler.cFiles.found_strings_file)
        Crawler.bad_url_set = Crawler.cFiles.get_file_data(Crawler.cFiles.invalid_file)
        Crawler.waiting_url_set = Crawler.cFiles.get_file_data(Crawler.cFiles.waiting_file)
        info = Crawler.cFiles.get_file_data(Crawler.cFiles.info_file)
        Crawler.start_page_url = start_url
        Crawler.tld = url_func.return_tld(start_url)
        Crawler.find_string_set = find_string_set
        Crawler.find_flname_set = find_flname_set
        Crawler.download_chunk_size = chunk_size
        logger.info('Crawler Initiated')
        logger.info('->Loading Website Info')
        logger.debug('* ' * 20 + 'Website Info' + '* ' * 20)
        if info is None:
            info = url_func.get_domain_info(Crawler.tld)
            Crawler.cFiles.set_file_data(Crawler.cFiles.info_file, info)
        for key in info:
            val = info[key]
            if val:
                logger.debug("%-20s : %s" % (str(key).upper(), str(val)))
        logger.debug('* ' * 40)

    @staticmethod
    def crawl_page(t_name, page_url):
        # noinspection PyBroadException
        try:
            logger.debug("%s - %s" % (t_name, page_url))
            if not Crawler.rerp.is_allowed(Crawler.user_agent, page_url):
                logger.debug('->%s not allowed to crawl %s' % (t_name, page_url))
                return
            Crawler.add_urls(page_url)
            if not Crawler.stop_request:
                Crawler.waiting_url_set.remove(page_url)
                Crawler.crawled_url_set.add(page_url)
                time.sleep(Crawler.delay)
        except requests.HTTPError as h:
            string = "HTTP Error %d - %s" % (h.response.status_code, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.ReadTimeout:
            string = "Timeout %0.1f secs - %s " % (Crawler.conn_timeout, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.TooManyRedirects as t:
            string = "%s - %s" % (t, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except (requests.ConnectionError, requests.ConnectTimeout):
            if url_func.check_connection() != url_func.CONNECTION_OK:
                Crawler.wait(t_name)
        except Exception:
            logger.exception('Exception in %s ' % page_url)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)

    @staticmethod
    def add_urls(page_url):
        not_html = False
        with closing(requests.get(page_url, stream=True, timeout=Crawler.conn_timeout)) as page:  # html code of page
            type_of_page = page.headers['Content-Type']  # get content type from header of html page
            page.raise_for_status()
            if 'html' in type_of_page:  # web page
                soup = BeautifulSoup(page.content, "html.parser")  # parse the content of page
                text = soup.text
                for string in Crawler.find_string_set:
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    str_url = string + ' ' + page_url
                    if text is not None and string in text:
                        Crawler.found_string_set.add(str_url)
                        logger.debug('%s %s %s' % (Crawler.found_string_prefix, string, page_url))
                for a_tag_content in soup.find_all('a'):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    url = parse.urljoin(Crawler.start_page_url, a_tag_content.get('href'))
                    if '#' in url:
                        url = url.split('#')[0]
                    if ' ' in url:
                        url = url.replace(' ', '%20')
                    if url_func.return_tld(url) == Crawler.tld:
                        if url not in Crawler.crawled_url_set:
                            Crawler.waiting_url_set.add(url)
            else:
                not_html = True
        if not_html:
            f_name = page_url.split('/')[-1]
            download_file = False
            for string in Crawler.find_flname_set:
                if Crawler.stop_request:
                    break
                if string in f_name:
                    download_file = True
                    break
            if download_file:
                type_split = type_of_page.split('/')
                f_dir = Crawler.cFiles.save_dir + '/' + type_split[0]
                if not dir_exists(f_dir):
                    make_dir(f_dir)
                Crawler.found_flname_set.add(page_url)
                Crawler.file_download(page_url, f_dir, f_name)
                if not Crawler.stop_request:
                    logger.debug('%s %s' % (Crawler.found_flname_prefix, page_url))

    # wait
    @staticmethod
    def wait(t_name):
        logger.info('->%s waiting for connection...' % t_name)
        while True:
            if Crawler.stop_request:
                break
            if url_func.check_connection() == url_func.CONNECTION_OK:
                break
            time.sleep(2)

    @staticmethod
    def update_files():
        logger.info('Updating Files')
        Crawler.cFiles.set_file_data(Crawler.cFiles.crawled_file, Crawler.crawled_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_files_file, Crawler.found_flname_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_strings_file, Crawler.found_string_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.invalid_file, Crawler.bad_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.waiting_file, Crawler.waiting_url_set)

    @staticmethod
    def file_download(file_url, f_dir, f_name):
        f_path = get_file_path(f_dir, f_name)
        # logger.info('Saving ', f_name)
        dl = file_size(f_path)
        resume_header = {'Range': 'bytes=%d-' % dl}
        with closing(requests.get(file_url, stream=True, headers=resume_header,
                                  timeout=Crawler.conn_timeout)) as file:
            tl_str = file.headers.get('content-length')
            # if there is no content length specified in header website doesnt support resuming
            mode = 'ab' if tl_str else 'wb'
            with open(f_path, mode) as handle:
                for chunk in file.iter_content(chunk_size=Crawler.download_chunk_size):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    if chunk:
                        handle.write(chunk)
def manual_add_robot_policies():
    # coz some critical sites have invalid robots.txt
    ## surprised to see SO MANY sites without valid robots.txt!
    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /search\n' + 'Disallow: /advanced_search\n')
    robots_policies['findingaids.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /catalog\n' + 'Disallow: /contact\n' +
                  'Disallow: /downloads\n' + 'Disallow: /users\n')
    robots_policies['digitalhub.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /catalog\n')
    robots_policies['images.library.northwestern.edu'] = site_rp
    robots_policies['images.northwestern.edu'] = site_rp
    robots_policies['media.northwestern.edu'] = site_rp
    robots_policies['arch.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /?*\n')
    robots_policies['schedule.radiology.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    try:
        request = urllib2.Request('http://www.ctd.northwestern.edu/robots.txt')
        response = urllib2.urlopen(request, timeout=5)
        content = response.read()
    except:
        content = 'User-agent: * \n'
        content += ('Disallow: /courses?*\n')
    site_rp.parse(content)
    robots_policies['www.ctd.northwestern.edu'] = site_rp
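A hedged sketch of how the module-level robots_policies dict seeded above would typically be consulted before fetching; the lookup helper below is an assumption for illustration, not part of the original module.

def is_fetch_allowed(url, user_agent='*'):
    # Hypothetical lookup against the manually seeded robots_policies dict;
    # permissive when no policy is recorded for the host.
    host = urlparse(url).hostname
    rp = robots_policies.get(host)
    return rp is None or rp.is_allowed(user_agent, url)

manual_add_robot_policies()
print(is_fetch_allowed('https://images.library.northwestern.edu/catalog'))  # False per the rules above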
def crawl(self, in_url):
    global global_id, last_update, DOMAIN
    print("Crawler %d on P#%d: %s" % (self.id, url_ids[in_url], in_url))
    try:
        request = urllib2.Request(in_url)
        response = urllib2.urlopen(request, timeout=5)
        real_url = w3lib.url.canonicalize_url(response.geturl())
        real_uri = urlparse(real_url)
        extension = real_uri.path.lower().split('.')[-1]
        if response.info().maintype != 'text' or extension in skip_file_types:
            content = ''
        else:
            content = response.read()
    except:
        real_url = in_url
        content = ''

    if real_url == in_url:
        # no redirect
        soup = BeautifulSoup(content, "html.parser")
        raw_urls = [link.get('href') for link in soup.find_all('a')]
    else:
        # redirect
        raw_urls = [real_url]

    out_urls = set()
    for url in raw_urls:
        # print('parsing', url)
        if url is None or len(url) <= 1:
            continue
        url = url.strip()
        if url.startswith('/http://') or url.startswith('/https://'):
            # why would someone do this?
            url = url[1:]
        if url.startswith('mailto:') or url.startswith('mailto@'):
            continue
        fixed_url = w3lib.url.canonicalize_url(urljoin(in_url, url))
        if len(fixed_url) > 1000:
            # long urls tend to be wrong urls
            continue
        uri = urlparse(fixed_url)
        if uri.scheme is not None and uri.scheme not in ['http', 'https', '']:
            continue
        if uri.hostname is not None:
            if not uri.hostname.endswith(DOMAIN):
                continue
            elif uri.hostname not in robots_policies:
                site_rp = RobotExclusionRulesParser()
                try:
                    site_rp.fetch('http://' + uri.hostname + '/robots.txt', timeout=3)
                except:
                    print "error with", ('http://' + uri.hostname + '/robots.txt')
                rp_lock.acquire()
                robots_policies[uri.hostname] = site_rp
                rp_lock.release()
            if not (robots_policies[uri.hostname].is_allowed("*", fixed_url)):
                continue
        extension = uri.path.lower().split('.')[-1]
        if extension in skip_file_types:
            continue
        if 1 < len(extension) < 8 and '/' not in extension:
            urls_extensions.add(extension)
        out_urls.add(fixed_url)
    # print out_urls

    # get lock
    write_lock.acquire()
    out_ids = []
    for url in out_urls:
        if url in url_ids:
            out_ids.append(url_ids[url])
        else:
            url_ids[url] = global_id
            out_ids.append(global_id)
            url_id_file.write('%d\t%s\n' % (global_id, url))
            url_id_file.flush()
            global_id += 1
            url_tasks.put(url)
    transition_file.write('%d\t%s\n' % (url_ids[in_url], str(out_ids)))
    transition_file.flush()
    last_update = time.time()
    write_lock.release()
    # release lock
    print('%d urls in total reported by %d' % (global_id, self.id))