def __get_robot_handler(url):
    """Return a RobotExclusionRulesParser loaded for *url*'s site.

    If *url* is not a valid URL the parser is returned empty (no rules),
    which callers can still query safely.

    :param url: URL whose site's robots.txt should be fetched
    :returns: RobotExclusionRulesParser (possibly with no rules loaded)
    """
    rp = RobotExclusionRulesParser()
    if Util.is_url(url):
        # get the original base url
        base_url = Util.get_base_url(url)
        # Fetch robots.txt exactly once. The original code also issued
        # requests.get() on the same URL and discarded the response,
        # downloading the file twice per call.
        rp.fetch(urljoin(base_url, 'robots.txt'))
    return rp
def __init__(self, url, robots_fetch_timeout, user_agent, logger):
    """Build the robots.txt filter for the site containing *url*.

    :param url: any URL on the target site; its path is replaced with
        the robots file path (ROBOTS_FILE) to locate robots.txt
    :param robots_fetch_timeout: timeout passed to the robots fetch
    :param user_agent: user-agent string stored for later rule checks
    :param logger: logger used for debug output
    """
    self._logger = logger
    # Swap the path component (index 2 of the 6-tuple) for the robots
    # file path, keeping scheme/netloc/query/fragment intact.
    parts = list(urlparse(url))
    parts[2] = ROBOTS_FILE
    robots_txt_url = str(urlunparse(tuple(parts)))
    robots_filter = RobotExclusionRulesParser()
    logger.debug("Fetching robots filter from path: %s" % robots_txt_url)
    robots_filter.fetch(robots_txt_url, robots_fetch_timeout)
    self._robots_filter = robots_filter
    self._ua = user_agent
class Robot:
    """Fetches a site's robots.txt and answers questions about it."""

    def __init__(self, url):
        # Resolve robots.txt at the site root relative to the given URL.
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        """Return the Crawl-delay the site requests for our user agent."""
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        """Return True when robots.txt forbids fetching *url*."""
        allowed = self.rerp.is_allowed('Mozilla/5.0', url.url())
        return not allowed
def load_robot_rules():
    """Load rules from the robots.txt.

    If the online version is not accessible, the local version is
    loaded from disk instead.

    :returns: RobotExclusionRulesParser populated with the rules
    """
    rerp = RobotExclusionRulesParser()
    try:
        rerp.fetch(urlparse.urljoin(BASE_URL, '/robots.txt'))
    except Exception:
        # Narrowed from a bare `except:` (which would also swallow
        # SystemExit/KeyboardInterrupt); `with` closes the fallback file,
        # which the original left open.
        with open('robots.txt', 'r') as fallback:
            rerp.parse(fallback.read())
    return rerp
def allowed_url(self):
    """Decide whether self.url may be crawled right now.

    Combines two gates: the site's robots.txt rules (cached per site in
    ROBOT_CACHE; a failed fetch caches None, meaning "assume allowed")
    and a per-host rate-limit key kept in self.db. Returns True when the
    URL is allowed and no delay is pending; returns False and parks the
    URL via URLHandler.add_to_busy when the host is currently throttled,
    or when robots.txt disallows the URL.
    """
    # FIXME: Should use the geturl address as it may have been redirected
    scheme, netloc, path, query, fragment = urlsplit(self.url)
    robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])
    # FIXME: Should cache robots.txt in a better persistent data structure
    if robot_url in ROBOT_CACHE:
        rp = ROBOT_CACHE[robot_url]
    else:
        rp = RobotExclusionRulesParser()
        try:
            rp.fetch(robot_url)
        # Currently if there's a problem we assume there is no robots.txt
        except IOError:
            # Should be catching the urllib2.URLError exception
            logging.debug("Couldn't retrieve robots.txt for %s" % robot_url)
            rp = None
        except UnicodeDecodeError:
            logging.debug("Unicode decode error for robots.txt at %s" % robot_url)
            rp = None
        except httplib.HTTPException:
            logging.debug("Generic HTTPException for robots.txt at %s" % robot_url)
            rp = None
        # Note: failures are cached too (as None), so a broken site is
        # only probed for robots.txt once.
        ROBOT_CACHE[robot_url] = rp
    if rp is None or rp.is_allowed("*", self.url):
        base_url = urlunsplit([scheme, netloc, "", "", ""])
        # If there's a current delay on the site respect robots.txt and stall
        if self.db.exists(netloc):
            logging.debug("Obeying robot overlord for %s..." % netloc)
            URLHandler.add_to_busy(self.db, self.url)
            return False
        # Set a delay for any other requests to this site to respect robots.txt
        delay = rp.get_crawl_delay("*") if rp else None
        if delay:
            delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
        else:
            delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
        # Expiring key acts as the per-host "busy" flag checked above.
        self.db.setex(netloc, "1", delay)
        return True
    else:
        return False
def is_url_allowed(url):
    """ Returns ``True`` if robots.txt rules for given URL allow fetching it.

    This function parses the robots rules for given URL (if any) and
    returns a boolean flag that tells you whether fetching it is
    allowed. Note that it doesn't test whether the URL exists on the
    host.

    :param url: URL to test
    :returns: ``True`` if URL can be fetched, ``False`` otherwise
    """
    parser = RobotParser()
    parser.user_agent = UA_STRING
    parser.fetch(get_robots_url(url))
    # Anything but a clean 200 means no usable robots.txt: allow by default.
    if parser.response_code == 200:
        return parser.is_allowed(UA_STRING, url)
    return True
#!/usr/bin/python #encoding:utf-8 from robotexclusionrulesparser import RobotExclusionRulesParser as RobotsParser rb = RobotsParser() # rb.fetch("http://www.zhihu.com/robots.txt") # print rb # print rb._RobotExclusionRulesParser__rulesets # print rb.is_allowed('*', 'http://www.zhihu.com/loginasdkj?encode=12') # print rb.is_allowed('*', '/admin_inbox') # print '======' rb.fetch("http://www.iplaypython.com/robots.txt") print rb print '======' rb.fetch("http://baidu.com/robots.txt") print rb print '======' rb.fetch("http://jaysonhwang.com/robots.txt") print rb print '======'
def crawl(self, in_url):
    """Fetch *in_url*, extract outgoing links, and record the transition.

    Appends newly discovered URLs to the shared url_ids map / url_tasks
    queue and writes id and transition records to the shared log files.
    Shared state (url_ids, global_id, files) is guarded by write_lock;
    per-host robots policies are guarded by rp_lock.
    (Python 2 code: note the `print "error with", ...` statement.)
    """
    global global_id, last_update, DOMAIN
    print("Crawler %d on P#%d: %s" % (self.id, url_ids[in_url], in_url))
    try:
        request = urllib2.Request(in_url)
        response = urllib2.urlopen(request, timeout=5)
        real_url = w3lib.url.canonicalize_url(response.geturl())
        real_uri = urlparse(real_url)
        extension = real_uri.path.lower().split('.')[-1]
        # Skip non-text responses and known binary extensions entirely.
        if response.info().maintype != 'text' or extension in skip_file_types:
            content = ''
        else:
            content = response.read()
    except:
        # Any fetch failure is treated as an empty page at the input URL.
        real_url = in_url
        content = ''
    if real_url == in_url:
        # no redirect
        soup = BeautifulSoup(content, "html.parser")
        raw_urls = [link.get('href') for link in soup.find_all('a')]
    else:
        # redirect: follow only the redirect target, don't parse the body
        raw_urls = [real_url]
    out_urls = set()
    for url in raw_urls:
        #print('parsing', url)
        if url is None or len(url) <= 1:
            continue
        url = url.strip()
        if url.startswith('/http://') or url.startswith('/https://'):
            # why would someone do this?
            url = url[1:]
        if url.startswith('mailto:') or url.startswith('mailto@'):
            continue
        fixed_url = w3lib.url.canonicalize_url(urljoin(in_url, url))
        if len(fixed_url) > 1000:
            # long urls tend to be wrong urls
            continue
        uri = urlparse(fixed_url)
        if uri.scheme is not None and uri.scheme not in ['http', 'https', '']:
            continue
        if uri.hostname is not None:
            # Stay inside the crawl domain; fetch/cache robots.txt per host.
            if not uri.hostname.endswith(DOMAIN):
                continue
            elif uri.hostname not in robots_policies:
                site_rp = RobotExclusionRulesParser()
                try:
                    site_rp.fetch('http://' + uri.hostname + '/robots.txt', timeout=3)
                except:
                    # On failure an empty (allow-all) policy is cached.
                    print "error with", ('http://' + uri.hostname + '/robots.txt')
                rp_lock.acquire()
                robots_policies[uri.hostname] = site_rp
                rp_lock.release()
            if not (robots_policies[uri.hostname].is_allowed("*", fixed_url)):
                continue
        extension = uri.path.lower().split('.')[-1]
        if extension in skip_file_types:
            continue
        if 1 < len(extension) < 8 and '/' not in extension:
            urls_extensions.add(extension)
        out_urls.add(fixed_url)
    #print out_urls
    #get lock
    write_lock.acquire()
    out_ids = []
    for url in out_urls:
        if url in url_ids:
            out_ids.append(url_ids[url])
        else:
            # First sighting: assign an id, log it, and enqueue for crawling.
            url_ids[url] = global_id
            out_ids.append(global_id)
            url_id_file.write('%d\t%s\n' % (global_id, url))
            url_id_file.flush()
            global_id += 1
            url_tasks.put(url)
    transition_file.write('%d\t%s\n' % (url_ids[in_url], str(out_ids)))
    transition_file.flush()
    last_update = time.time()
    write_lock.release()
    #release lock
    print('%d urls in total reported by %d' % (global_id, self.id))
class CrawlerWorker( QObject ):
    """Qt worker that crawls a website breadth-wise from a base URL.

    Discovers links (<a>, <form action>, <area>), validates each against
    domain/protocol/robots.txt/duplicate checks, and reports progress to
    the owning GUI instance through its Qt signals (change_state,
    show_total, on_info, crawl_finished).
    """
    # spider that will get links of website
    # called to create instance of the class
    finish = False  # set True when crawling should stop for good

    def __init__(self, info, instance, parent=None):
        super(CrawlerWorker, self).__init__(parent)
        self._instance = instance
        self.running = True
        self.base_url = info['base_url']  # main url of website
        self._links_to_crawl = []  # list of links yet to open
        self.crawled_links = {}  # dictionary of links opened/all links
        self.__parsed_crawled = {}  # list of urls and their html pages
        self.total = 0  # total number of found links
        self.total_crawled = 0  # total number of valid crawled links in website
        self.max_pages = info['max_crawl']  # max pages to crawl
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # list of broken links found
        self.dynamic = []
        self.info = info
        self.login_url = info['login_url']  # login page url if available
        if info['robo_url']:
            # Optional robots.txt filter, consulted by _is_robot_allowed().
            self._rb_parser = RobotExclusionRulesParser()
            self._rb_parser.fetch(info['robo_url'])
            self._user_agent = 'WASecBot'
        else:
            self._rb_parser = None
        self.browser = browser.RoboBrowser(parser="html.parser",
                                           user_agent="WASecBot")
        self.browser.session.verify = False
        self._logged_in = False
        self.running = True
        self._instance.btncrawlcancel.clicked.connect(self.pause)
        self._elapsed = 0
        self.delay = 15
        self._requests = 0
        self.start = None
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _opener(self, url):
        # Open *url* in the browser, retrying up to 10 times on
        # connection errors; False means the site is unreachable.
        retry = 1
        while True:
            try:
                self.browser.open(url=url)
                break
            except exceptions.ConnectionError as ce:
                # sleep(self.delay * retry)
                if retry == 11:
                    return False
                else:
                    retry += 1
        return True

    def _compute_crawl_delay(self):
        # Derive self.delay from the running average response time of
        # the first 10 requests; afterwards restart the average.
        self._requests += 1
        if self._requests <= 10:
            self._elapsed += self.browser.response.elapsed.total_seconds()
            delay = self._elapsed / self._requests
            self.delay = delay * 200
            if self.delay >= 180:
                self.delay = 15
        else:
            self._requests = 1
            self._elapsed = self.browser.response.elapsed.total_seconds()
            self.delay = self._elapsed * 200

    def pause(self):
        # Cancel-button handler: confirm with the user, then either
        # finish up or resume crawling.
        self.running = False
        self._instance.change_state.emit('Canceling...')
        choice = QtWidgets.QMessageBox.question(
            self._instance, "Cancel Crawl!",
            "WASec is not finished yet, are You sure you want to stop crawling?",
            QtWidgets.QMessageBox.Cancel | QtWidgets.QMessageBox.Yes)
        if choice == QtWidgets.QMessageBox.Yes:
            self.finish = True
            self.running = False
            self._instance.crawl_finished.emit(self._wrap_up())
        else:
            self.running = True

    # get total number of links opened so far
    def total_links(self):
        total = 0
        for index in self.crawled_links:
            total += len(self.crawled_links[index]['url'])
        return total

    # check if max pages reached (max_pages == 0 means unlimited)
    def _crawled_max(self):
        result = (self.max_pages == 0) or (self.max_pages > self.total_links())
        return result

    # is link already listed (either crawled or queued)?
    def _is_link_listed(self, link):
        self._instance.change_state.emit('Check if URL is listed...')
        url = parse.urljoin(self.base_url, link)
        result = False
        for index in self.crawled_links:
            for opened in self.crawled_links[index]['url'].keys():
                if url == opened or link == opened:
                    result = True
        for to_open in self._links_to_crawl:
            if link == to_open[1] or url == to_open[1]:
                result = True
        return result

    # gets dynamic urls (anything containing '?' or '=')
    def _is_dynamic(self, url):
        self._instance.change_state.emit('Check if URL is dynamic...')
        if '?' in str(url) or '=' in str(url):
            self.dynamic.append(url)

    # check if page opened and exists
    def _is_response_ok(self, url):
        # status_code 200 means OK; no problems with page
        if 200 == self.browser.response.status_code:
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    def _is_html_page(self, url):
        # Only HTML responses are crawlable; a missing content-type
        # header is given the benefit of the doubt.
        try:
            if 'text/html' in self.browser.response.headers["content-type"]:
                return True
            else:
                self.invalid_links_count += 1
                self.invalid_links_list.append(url)
                self._instance.change_state.emit('URL is invalid!')
                return False
        except KeyError:
            return True

    def _is_same_page(self, url):
        # Reject *url* when its parsed content is identical to a page
        # already crawled (duplicate-content detection).
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            page = self.browser.parsed
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    check = self.__parsed_crawled[link]
                    if check == page:
                        self._instance.change_state.emit('URL is invalid!')
                        return False
            return True
        else:
            # Unreachable site: stop the whole crawl.
            self.finish = True
            self.running = False
            return False

    def _page_wise(self, url):
        # All page-level checks: response OK, is HTML, not a duplicate.
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            return self._is_response_ok(url) and self._is_html_page(
                url) and self._is_same_page(url)
        else:
            self.finish = True
            self.running = False
            return False

    def _is_same_query(self, page_link):
        # Reject a URL whose path and query-parameter *names* match an
        # already-crawled URL (same page, different parameter values).
        parsed_url = parse.urlparse(page_link)
        query = parse.parse_qsl(parsed_url.query)
        query_len = len(query)
        if query_len > 0:
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    parsed_link = parse.urlparse(link)
                    link_query = parse.parse_qsl(parsed_link.query)
                    if (parsed_link.path == parsed_url.path) and (len(link_query) == query_len):
                        i = n = 0
                        while i < query_len:
                            if query[i][0] == link_query[i][0]:
                                n += 1
                            i += 1
                        if n == query_len:
                            # result = self._is_same_page(page_link)
                            # return result
                            self._instance.change_state.emit('URL is invalid!')
                            print("is same query")
                            return False
        return True

    # check if given url belongs to website
    # i.e. is in the website's domain
    def _in_domain(self, url):
        if self.base_url in url:
            # result = 0 meaning url belongs to website
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check for url protocol
    def _check_protocol(self, url):
        parsed = parse.urlparse(url)  # parse url to get information from it
        protocol = str.lower(str(parsed[0]))  # get url protocol
        if protocol == "http" or protocol == "https":  # is protocol 'http' or 'https'
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            self._instance.change_state.emit('URL is invalid!')
            return False

    def _is_robot_allowed(self, path):
        # No robots.txt configured means everything is allowed.
        if self._rb_parser:
            return self._rb_parser.is_allowed(self._user_agent, path)
        else:
            return True

    def _url_wise(self, url):
        # All URL-level checks (no network access beyond what the
        # individual checks perform).
        return self._in_domain(url) and self._check_protocol(
            url) and self._is_same_query(url)

    def _is_url_good(self, url):
        return self._url_wise(url) and self._page_wise(url)

    def _at_login(self, url):
        # True only when *url* is exactly the configured login page.
        if not self.login_url or self.login_url != str(url):
            return False
        elif self.login_url == str(url):
            return True

    def _check_login(self, parsed):
        # Attempt login through BeyondLogin when credentials were
        # supplied; on failure re-queue the login redirect target.
        if self.info['logged_in']:
            self._instance.change_state.emit('Logging into the website...')
            handel = BeyondLogin(self.browser)
            self._logged_in = handel.get_login_info(self.info)
            parent = self._check_parent(handel.login_url)
            if self._logged_in:
                self._instance.change_state.emit('Login Successful!')
                # sleep(2)
                if parent:
                    self._add_crawled(handel.login_url, parent, parsed)
                else:
                    self._add_crawled(handel.login_url, self.base_url, parsed)
            else:
                self._instance.change_state.emit('Login Failed!')
                self._links_to_crawl.append(
                    [handel.login_url, handel.redirect_url])
        else:
            self._instance.change_state.emit('Login Successful!')
            self._logged_in = True

    def _check_parent(self, url):
        # Return the parent URL under which *url* was queued, if any.
        for child in self._links_to_crawl:
            if child[1] == url:
                return child[0]
        return None

    # get all links in a given page
    def _get_page_links(self, url, page):
        self._instance.change_state.emit('Searching for all links in page...')
        # gets a list of all <a> tags in page
        links_tags = page.find_all("a")
        # going through each link
        for link in links_tags:
            self._instance.change_state.emit(
                'Searching for all links in page...')
            link_href = link.get(
                "href"
            )  # get <a> tag link reference. example: <a href="page.html"> ==> page.html
            # check that: link isn't already listed + link isn't blank
            link_listed = self._is_link_listed(link_href)
            if (not link_listed) and ('#' not in str(link_href)):
                # add link to list of links to open
                self._links_to_crawl.append([url, link_href])
                print("_get_page_links")
                print(url, link_href)
                self.total += 1
        forms = page.find_all("form")
        for form in forms:
            action = form.get("action")
            if action:  # link isn't blank
                # check that: link isn't already listed +
                link_listed = self._is_link_listed(action)
                if (not link_listed) and (action != "#"):
                    # add link to list of links to open
                    self._links_to_crawl.append([url, action])
                    self.total += 1
                    self._instance.show_total.emit(self.total)
        image_map = page.find_all('area')
        for area in image_map:
            href = area.get(
                'href'
            )  # get 'href' attribute from <area shape="rect" href="#main"> tag
            listed = self._is_link_listed(href)
            if (not listed) and ('#' not in href):
                # add link to list of links to open
                self._links_to_crawl.append([url, href])
                self.total += 1
                self._instance.show_total.emit(self.total)

    # open a page and get its content
    def _open_url(self, url):
        # NOTE: returns None when self.running is False — callers must
        # tolerate a missing parsed page in that case.
        if self.running:
            self._instance.change_state.emit('Pausing between requests...')
            # get page content
            parsed = self.browser.parsed
            if self.info['max_crawl'] != 1:
                self._get_page_links(
                    url, parsed)  # send content to retrieve links from
                # sleep(self.delay)
            else:
                self._add_crawled(url, url, parsed)
            self._is_dynamic(url)
            self._instance.show_total.emit(self.total_crawled)
            if self._at_login(url) and not self._logged_in:
                self._check_login(parsed)
            return parsed

    def _add_crawled(self, url, parent, parsed_page):
        # Record *url* (with its page title) under its parent entry in
        # crawled_links, and keep the parsed page for duplicate checks.
        self._instance.change_state.emit('Adding new crawled link...')
        found = False
        try:
            title = parsed_page.find('title')
            if not title:
                title = 'NO-TITLE'
            else:
                title = title.text
        except:
            title = 'NO-TITLE'
        for index in self.crawled_links:
            if self.crawled_links[index]['from'] == parent:
                self.crawled_links[index]['url'][url] = title
                found = True
                break
        if not found:
            self.crawled_links[self.total_crawled] = {
                'from': parent,
                'url': {
                    url: title
                }
            }
            self.total_crawled += 1
        self.__parsed_crawled[url] = parsed_page
        self._instance.on_info.emit(self.crawled_links)
        # sleep(2)

    # main spider function; creates our spider's web
    def run(self):
        self.start = datetime.now().time()
        self._opener(self.base_url)
        self._open_url(self.base_url)  # send main url to be opened and checked
        self._elapsed = self.browser.state.response.elapsed.total_seconds()
        self._compute_crawl_delay()
        # while there are still links to open
        self.i = len(self._links_to_crawl) - 1
        while (len(self._links_to_crawl)) > 0 and (
                self._crawled_max()) and not self.finish:
            self._instance.change_state.emit('Crawling...')
            # start from the last link in the list
            parent = self._links_to_crawl[self.i][0]
            link = self._links_to_crawl[self.i][1]
            print("----")
            print(parent, link)
            if parent[len(parent) - 1] != '/':
                parent = parent + '/'
            # url = parse.urljoin(self.base_url, link)  # join main url with page link
            url = parse.urljoin(parent, link)  # join main url with page link
            self._opener(url)
            if 200 != self.browser.response.status_code:
                # Fall back to resolving against the base URL.
                url = parse.urljoin(self.base_url,
                                    link)  # join main url with page link
            print(url)
            if self._is_url_good(url) and self._is_robot_allowed(
                    link):  # is url valid and working
                print("good")
                self._instance.change_state.emit('URL is good!')
                parsed_page = self._open_url(url)  # open page
                self._add_crawled(url, parent, parsed_page)
                self._compute_crawl_delay()
                # add link to list of opened links
                self._is_dynamic(url)
            else:
                print("not good")
                self._instance.change_state.emit('URL is not good!')
            # delete opened link from list of links to open
            self._links_to_crawl.pop(self.i)
            if self.i > 0:
                self.i = self.i - 1
            elif self.i == 0:
                self.i = len(self._links_to_crawl) - 1
            if len(self._links_to_crawl) == 0 or self.i < 0:
                self._instance.change_state.emit('Finished.')
                self.finish = True
                break
        self.finish = True
        self._instance.crawl_finished.emit(self._wrap_up())

    def _calc_time(self):
        # Human-readable wall-clock duration since self.start.
        finish = datetime.now().time()
        delta1 = timedelta(seconds=self.start.second,
                           microseconds=self.start.microsecond,
                           minutes=self.start.minute,
                           hours=self.start.hour)
        delta2 = timedelta(seconds=finish.second,
                           microseconds=finish.microsecond,
                           minutes=finish.minute,
                           hours=finish.hour)
        taken = delta2 - delta1
        seconds = round(taken.total_seconds())
        if seconds >= 3600:
            hours = round(seconds / 3600)
            minutes = (round((seconds / 3600) / 60))
            elapsed = str(hours) + ':' + str(minutes) + ' hrs'
        elif seconds >= 60:
            minutes = round(seconds / 60)
            seconds = round(seconds % 60)
            elapsed = str(str(minutes) + '.' + str(seconds) + ' mins')
        else:
            elapsed = str(seconds) + ' secs'
        return elapsed

    def _wrap_up(self):
        # Summary dictionary emitted via crawl_finished.
        wrap = {
            'links': self.crawled_links,
            'dynamic': self.dynamic,
            'total_crawled': self.total_links(),
            'total': self.total,
            'invalid': self.invalid_links_count,
            'running': self.running,
            'time': self._calc_time()
        }
        return wrap