def checkRobots(self):
    """Download and parse ``robots.txt`` for ``self.domain``.

    ``self.domain`` is normalised to end with a slash, and the loaded
    ``RobotFileParser`` is stored on ``self.rp``.
    """
    # endswith() also copes with an empty domain, where indexing the last
    # character would raise IndexError.
    if not self.domain.endswith("/"):
        self.domain += "/"
    robots_url = self.domain + "robots.txt"
    self.rp = RobotFileParser()
    self.rp.set_url(robots_url)
    # NOTE(review): read() performs its own fetch; a Request carrying
    # config.crawler_user_agent used to be built here but was never sent, so
    # the unused object has been removed.
    self.rp.read()
def _get_request_delay(
    host: str,
    url: str,
    robots_parser: RobotFileParser,
    delay_mapping: Dict[str, Union[int, float]],
    default_delay: Union[int, float],
) -> Union[int, float]:
    """Pick the delay to apply between requests to ``host``.

    The ``*`` user agent's ``Crawl-delay`` wins; otherwise a delay is derived
    from ``Request-rate`` (seconds / requests); otherwise ``default_delay`` is
    used. The chosen value is also stored in ``delay_mapping[host]``.

    :param host: key under which the delay is recorded in ``delay_mapping``.
    :param url: URL being processed; only used in log messages.
    :param robots_parser: parser that already holds the host's robots.txt.
    :param delay_mapping: mutable mapping updated with the chosen delay.
    :param default_delay: fallback when robots.txt gives no usable value.
    :returns: the delay that was stored for ``host``.
    """
    crawl_delay = robots_parser.crawl_delay('*')
    if crawl_delay is not None:
        delay_mapping[host] = crawl_delay
        logger.debug(
            'returning crawl delay value "%s" from robots.txt for url %s',
            crawl_delay, url)
        return crawl_delay

    request_rate = robots_parser.request_rate('*')
    # "Request-rate: 0/n" parses successfully but cannot be turned into a
    # delay (division by zero), so it is treated like a missing directive.
    if request_rate is not None and request_rate.requests:
        request_delay = request_rate.seconds / request_rate.requests
        delay_mapping[host] = request_delay
        logger.debug(
            'computing value "%s" from request delay info (%s/%s) from robots.txt for url %s',
            request_delay,
            request_rate.requests,
            request_rate.seconds,
            url,
        )
        return request_delay

    delay_mapping[host] = default_delay
    logger.debug('returning default delay value "%s" for url %s', default_delay, url)
    return default_delay
def is_allowed(self):
    """Tell whether the configured user agent may fetch this object's URL.

    When the site's robots.txt was not retrieved with status 200 there are no
    rules to apply and the URL is considered allowed.
    """
    if self.site.robots_status != 200:
        return True
    rules = RobotFileParser()
    # readlines() keeps the line endings; the parser strips them itself.
    robots_lines = io.StringIO(self.site.robots).readlines()
    rules.parse(robots_lines)
    return rules.can_fetch(settings.USER_AGENT, self.get_url())
def analyze_badbots(db, filename, max_entries=100000, max_bots=100):
    """Count user agents that are banned outright and write a TSV report.

    A user agent counts as banned by a robots.txt when its entry consists of
    exactly one rule and that rule is ``Disallow: /``.

    :param db: iterable of objects exposing ``url`` and ``body`` (robots.txt text).
    :param filename: path of the tab-separated report to write.
    :param max_entries: maximum number of ``db`` entries to examine.
    :param max_bots: number of top user agents written to the report.
    :returns: list of ``(useragent, count)`` tuples, most banned first.
    """
    useragents = defaultdict(int)
    for i, entry in enumerate(db):
        # Check the limit before doing any work so exactly max_entries
        # entries are examined (checking afterwards processed one extra).
        if i >= max_entries:
            break
        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))
        for rules in parser.entries:
            if len(rules.rulelines) != 1:
                continue
            only_rule = rules.rulelines[0]
            if not only_rule.allowance and only_rule.path == '/':
                for useragent in rules.useragents:
                    useragents[useragent] += 1

    useragents = sorted(useragents.items(), key=lambda x: -x[1])

    with open(filename, "w") as output:
        output.write("useragent\tcount\ttype\tinfolink\tcompany\thomepage\n")
        for useragent, count in useragents[:max_bots]:
            agenttype, info, company, homepage = BOT_TYPES.get(useragent, ('', '', '', ''))
            if not info:
                # No curated link for this bot: fall back to the generic URL.
                info = DEFAULT_USERAGENT_URL % useragent
            output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (useragent, count, agenttype, info, company, homepage))

    return useragents
def get_robot_file_parser(start_url: str, **kwargs) -> Union[RobotFileParser, None]:
    """Build a :class:`~python:urllib.robotparser.RobotFileParser` for a site.

    The ``robots.txt`` location is derived from the scheme and network
    location of ``start_url``. Any failure while parsing the URL, downloading
    the file or parsing it is logged as a warning and ``None`` is returned.

    :param start_url: URL from which ``robots.txt`` will be collected.
    :param kwargs: Will be passed to :func:`get_html`.

    .. seealso:: :func:`async_get_robot_file_parser`
    """
    try:
        location = ParsedUrl(start_url)
        robot_txt_url = f"{location.scheme}://{location.netloc}/robots.txt"
        rp = RobotFileParser(robot_txt_url)
        text = get_html(
            robot_txt_url,
            check_http_content_type=False,
            return_response_object=False,
            raise_for_status=True,
            **kwargs
        )
        # Drop completely empty lines, strip the remaining ones.
        cleaned = []
        for raw_line in text.split("\n"):
            if raw_line != '':
                cleaned.append(raw_line.strip())
        rp.parse(cleaned)
        return rp
    except Exception as e:
        # Exceptions from URL parsing, HTML retrieval and robot file parsing
        logging.warning(
            f"Unable to retrieve robots.txt from {start_url}. Reason: {e}")
        return None
def analyze_badbots(db, filename, max_entries=100000, max_bots=100):
    """Tally user agents that robots.txt files ban completely and report them.

    An entry is a ban when it holds a single rule and that rule disallows
    ``/``. Each user agent named by such an entry is counted once per file.

    :param db: iterable of objects exposing ``url`` and ``body`` (robots.txt text).
    :param filename: destination of the tab-separated report.
    :param max_entries: upper bound on the number of ``db`` entries examined.
    :param max_bots: how many of the most-banned user agents are written out.
    :returns: list of ``(useragent, count)`` tuples sorted by descending count.
    """
    useragents = defaultdict(int)
    for i, entry in enumerate(db):
        # The limit is tested up front; testing it after the work was done
        # let max_entries + 1 entries through.
        if i >= max_entries:
            break
        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))
        bans = [
            e for e in parser.entries
            if len(e.rulelines) == 1
            and not e.rulelines[0].allowance
            and e.rulelines[0].path == '/'
        ]
        for ban in bans:
            for useragent in ban.useragents:
                useragents[useragent] += 1

    useragents = sorted(useragents.items(), key=lambda x: -x[1])

    with open(filename, "w") as output:
        output.write("useragent\tcount\ttype\tinfolink\tcompany\thomepage\n")
        for useragent, count in useragents[:max_bots]:
            agenttype, info, company, homepage = BOT_TYPES.get(
                useragent, ('', '', '', ''))
            if not info:
                # Unknown bot: point at the generic lookup URL instead.
                info = DEFAULT_USERAGENT_URL % useragent
            output.write(
                "%s\t%s\t%s\t%s\t%s\t%s\n" %
                (useragent, count, agenttype, info, company, homepage))

    return useragents
def get_robots(self):
    """Return a ``RobotFileParser`` built from ``self.robots_content``.

    ``robots_content`` may be the raw robots.txt text (``str`` or ``bytes``)
    or an iterable of lines. When it is empty or missing, the returned parser
    allows everything.
    """
    rp = RobotFileParser()
    content = self.robots_content
    if not content:
        rp.allow_all = True
        return rp
    if isinstance(content, bytes):
        content = content.decode("utf-8", "replace")
    if isinstance(content, str):
        # parse() iterates its argument line by line; handing it a plain
        # string would feed it single characters and no rule would match.
        content = content.splitlines()
    rp.parse(content)
    return rp
def _init_robots(self, root_url):
    """Fetch the robots.txt that belongs to ``root_url`` and return its parser.

    The robots.txt address is built from the scheme and network location of
    ``root_url``; any path, query or fragment is ignored.
    """
    parts = urlparse(root_url)
    robots = RobotFileParser('{}://{}/robots.txt'.format(parts.scheme, parts.netloc))
    robots.read()
    return robots
def __init__(self, location: str, get_accredited: bool):
    """Load robots.txt for the configured starting URL and store the options.

    :param location: location name; stored lower-cased.
    :param get_accredited: flag stored unchanged on ``self._acc``.
    """
    robots_url = urljoin(ICFG.STARTING_URL, 'robots.txt')
    self._robot_parser = RobotFileParser()
    self._robot_parser.set_url(robots_url)
    self._robot_parser.read()
    # Sitemap URLs announced by robots.txt, as reported by the parser.
    self._site_maps = self._robot_parser.site_maps()
    self.location = location.lower()
    self._acc = get_accredited
def __init__(self, robotstxt_body, spider):
    """Parse a downloaded robots.txt body with the standard-library parser.

    :param robotstxt_body: raw robots.txt body; decoded by ``decode_robotstxt``.
    :param spider: spider the rules belong to; kept on ``self.spider``.
    """
    from urllib.robotparser import RobotFileParser

    self.spider = spider
    decoded_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
    parser = RobotFileParser()
    self.rp = parser
    parser.parse(decoded_body.splitlines())
def __init__(self, basicUrl, pageLimit, level):
    """Set up the crawler state.

    :param basicUrl: base URL; also compiled as the regex kept in ``_urlRegex``.
    :param pageLimit: stored as ``_pageCount``, the page limit.
    :param level: stored unchanged as ``_level``.
    """
    self._basicUrl = basicUrl
    self._urlRegex = re.compile(basicUrl)
    self._robotParser = RobotFileParser()
    self._pageHeap = PriorityQueue()
    self._visited = set()
    self._pageCount = pageLimit  # A limit of pages
    self._level = level
def __init__(self, base_url):
    """Load ``robots.txt`` relative to ``base_url``.

    On success ``self.rp`` holds the loaded ``RobotFileParser``; if the file
    cannot be retrieved or parsed ``self.rp`` is ``None``.
    """
    try:
        url = urljoin(base_url, 'robots.txt')
        self.rp = RobotFileParser()
        # set_url() lives on the parser, not on this object. Calling it on
        # self raised AttributeError, which the handler below swallowed,
        # leaving self.rp permanently None.
        self.rp.set_url(url)
        self.rp.read()
    except Exception:
        # Best effort: treat any failure as "no robots.txt available".
        self.rp = None
def http_open(self, request):
    """Open ``request`` only if the host's robots.txt permits it.

    Raises ``RuntimeError`` when ``self.agentname`` is forbidden from
    fetching the requested URL.
    """
    target = request.get_full_url()
    netloc = urlsplit(target)[1]
    checker = RobotFileParser(urlunsplit(('http', netloc, '/robots.txt', '', '')))
    checker.read()
    if checker.can_fetch(self.agentname, target):
        return urllib2.HTTPHandler.http_open(self, request)
    raise RuntimeError('Forbidden by robots.txt')
class Crawler(Thread):
    """Worker thread that pulls URLs from a scheduler and harvests links."""

    def __init__(self, scheduler, id):
        """Store the scheduler and the worker id; the thread is not started."""
        Thread.__init__(self)
        self.scheduler = scheduler
        self.robot_parser = RobotFileParser()
        self.running = True
        self.id = id

    def run(self):
        """Crawl until the module-level ``RUNNING`` flag or ``self.running`` clears.

        For each URL the host's robots.txt is fetched and checked for the
        ``*`` user agent. Allowed pages are downloaded, and every ``href="``
        target found is handed back to the scheduler.
        """
        global RUNNING
        print(self.id,"running!")
        while RUNNING and self.running:
            url = self.scheduler.next()
            while url is None:
                sleep(2)
                url = self.scheduler.next()
            try:
                # Check robots.txt
                parsed_url = urlparse(url)
                robots_url = parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt"
                with urlopen(robots_url, timeout=2) as robots_response:
                    robots_file = robots_response.read()
                # A fresh parser per host: parse() appends to existing rules,
                # so reusing one parser would apply earlier hosts' rules here.
                self.robot_parser = RobotFileParser()
                self.robot_parser.parse(robots_file.decode('utf-8').splitlines())
                if not self.robot_parser.can_fetch("*", url):
                    continue
                # Fetch the url
                print(self.id,"->",url)
                with urlopen(url, timeout=2) as response:
                    data = response.read().decode('utf-8', errors='ignore')
                data = data.split('href="')
                del data[0]
                data = sorted(data, key=len)
            except HTTPError:
                # HTTP status errors are skipped silently. HTTPError must be
                # listed before its parent class URLError.
                data = []
            except URLError as e:
                print(url,"->",str(e))
                data = []
            except Exception:
                # Timeouts, resets, decoding problems, ...: skip this URL.
                # Exception (not BaseException) lets SystemExit propagate.
                data = []
            for d in data:
                d = d[:d.find('"')]
                if d.endswith('/'):
                    d = d[:-1]
                if d.startswith(("http://", "https://")):
                    # Already absolute (https links used to be mistaken for
                    # relative ones and glued onto the current page's URL).
                    self.scheduler.add(d)
                    continue
                # Format relative URLs
                path = parsed_url.path
                base_path = path[:path.rfind("/")]
                while d.startswith("../"):
                    base_path = base_path[:base_path.rfind("/")]
                    d = d[3:]
                self.scheduler.add(parsed_url.scheme +
                                   "://" + parsed_url.netloc +
                                   base_path + "/" + d)
def check_robot_txt(url):
    """Print whether any crawler (``*``) may fetch ``<url>/stock/``.

    The rules are read from ``<url>/robots.txt``. If the download fails with
    ``URLError`` its reason is printed instead. Always returns ``None``.
    """
    robots_location = url + '/robots.txt'
    stock_page = url + '/stock/'
    try:
        rules = RobotFileParser(robots_location)
        rules.read()
        print(rules.can_fetch('*', stock_page))
    except URLError as e:
        print(e.reason)
    return
class Driver:
    """Small page fetcher that honours robots.txt and exposes CSS look-ups."""

    def __init__(self,
                 root_url: str,
                 header: str,
                 access_delay: int = 3,
                 cookies: dict = None,
                 logger=None):
        """Remember the connection settings and load the site's robots.txt.

        :param root_url: site root; page paths are appended after a ``/``.
        :param header: passed as ``headers`` to ``requests.get``.
        :param access_delay: seconds slept before every page access.
        :param cookies: passed as ``cookies`` to ``requests.get``.
        :param logger: optional logger; nothing is logged when ``None``.
        """
        self.root_url = root_url
        self.header = header
        self.cookies = cookies
        self.access_delay = access_delay
        self.logger = logger
        self.now_content = None
        self.robots = None
        self.load_robots_txt()

    def load_robots_txt(self):
        """Download ``<root_url>/robots.txt`` into ``self.robots``."""
        self.robots = RobotFileParser()
        self.robots.set_url(self.root_url + '/robots.txt')
        self.robots.read()

    def get(self, path):
        """Fetch ``<root_url>/<path>`` into ``self.now_content`` if allowed.

        Any exception is reported through the logger (when present) instead
        of being raised.
        """
        try:
            sleep(self.access_delay)
            url = f'{self.root_url}/{path}'
            if not self.robots.can_fetch("*", url):
                if self.logger is not None:
                    self.logger.warning(
                        f"Access to this url is prohibited by robots.txt.\n<*>[URL={url}]"
                    )
                return
            res = requests.get(url, headers=self.header, cookies=self.cookies)
            if self.logger is not None:
                self.logger.debug(f"Access to {url}.")
            self.now_content = BeautifulSoup(res.text, 'html.parser')
        except Exception as e:
            if self.logger is not None:
                self.logger.warning(e)

    def find_element_by_class_name(self, name):
        """Return the first element of the current page with CSS class ``name``."""
        return self.find_elements_by_class_name(name)[0]

    def find_elements_by_class_name(self, name):
        """Return every element of the current page with CSS class ``name``."""
        return self.now_content.select('.' + name)

    def find_element_by_id(self, name):
        """Return the first element of the current page whose id is ``name``."""
        return self.find_elements_by_id(name)[0]

    def find_elements_by_id(self, name):
        """Return every element of the current page whose id is ``name``."""
        return self.now_content.select('#' + name)

    def find_element_by_tag(self, name):
        """Return the result of ``find_all(name)`` on the current page."""
        return self.now_content.find_all(name)
def can_fetch(self, url):
    """Tell whether ``self.user_agent`` may fetch the path of ``url``.

    Downloading and parsing robots.txt is expensive, so one parser per host
    name is kept in ``self.robot_file_parsers`` and reused for later queries.
    Only the path component of ``url`` is checked against the rules.
    """
    parts = urlparse(url)
    host = parts.hostname
    cached = self.robot_file_parsers.get(host)
    if cached is not None:
        return cached.can_fetch(self.user_agent, parts.path)

    fresh = RobotFileParser(self.get_robot_url(url))
    fresh.read()
    self.robot_file_parsers[host] = fresh
    return fresh.can_fetch(self.user_agent, parts.path)
class CrawlController(object):
    """Decides whether a URI may be crawled according to a robots.txt file."""

    def __init__(self):
        # Parser for the most recently checked robots.txt.
        self._rp = RobotFileParser()

    def allow(self, p_robots_uri, p_target_uri):
        """Return whether any crawler (``*``) may fetch ``p_target_uri``.

        :param p_robots_uri: address of the robots.txt to download.
        :param p_target_uri: URI to check against the downloaded rules.
        :returns: ``True`` when allowed or when robots.txt has no content.
        """
        http = urllib3.PoolManager()
        r = http.request('GET', p_robots_uri)
        if r.data:
            # Start from a clean parser: parse() adds to existing rules, so
            # reusing the old one would mix rules of different sites.
            self._rp = RobotFileParser()
            self._rp.parse(r.data.decode('utf-8').splitlines())
            # The parser lives on self; the bare name ``rp`` used here
            # before was undefined and raised NameError.
            return self._rp.can_fetch('*', p_target_uri)
        return True
def test_robots_txt(self):
    """Every robot in ``self.robots`` may fetch the index page but not the admin."""
    parser = RobotFileParser(self.live_server_url + '/robots.txt')
    parser.read()

    expectations = (
        ('/index.html', self.assertTrue),
        ('/admin/', self.assertFalse),
    )
    for path, check in expectations:
        url = self.live_server_url + path
        for robot in self.robots:
            check(parser.can_fetch(robot, url))
def __getParser(self, url):
    """Return a loaded parser for ``https://<url>/robots.txt``.

    ``False`` is returned for an empty ``url`` or when reading robots.txt
    raises any exception.
    """
    if url == '':
        return False
    site = 'https://' + url + '/robots.txt'
    try:
        rp = RobotFileParser(site)
        rp.read()
    except Exception:
        return False
    return rp
def check_robots(base_url, ext_url):
    """Check robots.txt and report whether ``base_url + ext_url`` may be crawled.

    Prints a note with the verdict for the ``*`` user agent and returns the
    loaded ``RobotFileParser``.
    """
    bot = RobotFileParser(base_url + '/robots.txt')
    bot.read()
    permitted = bot.can_fetch('*', base_url + ext_url)
    print('robots.txt permits parsing' if permitted else 'Do not parse')
    return bot
def request(self, url: str) -> None:
    """Perform the robots.txt request once and remember the outcome.

    Nothing happens when ``self.state`` is already set. Otherwise
    ``self.state`` becomes the loaded ``RobotFileParser``, or ``False`` when
    loading raised an exception.
    """
    if self.state is not None:
        return
    try:
        robots = RobotFileParser()
        robots.set_url(url)
        robots.read()
    except Exception:
        self.state = False
    else:
        self.state = robots
def __init__(self, scheme: str, hostname: str):
    """Create the per-host state and load the host's robots.txt.

    :param scheme: URL scheme used to reach the host.
    :param hostname: host whose ``/robots.txt`` is downloaded.
    """
    self.scheme = scheme
    self.hostname = hostname
    self.last_time = 0
    self._urls = set()
    self._queue = deque()

    # parse robots.txt
    robots_location = "{}://{}/robots.txt".format(scheme, hostname)
    self._robot_parser = RobotFileParser(robots_location)
    self._robot_parser.read()
def filter_googlebot(entries):
    """Find robots.txt entries that admit GoogleBot but ban an unknown bot.

    Entries whose ``status_code`` is not 200 are ignored. Yields
    ``(entry, parser)`` tuples, where ``parser`` is the ``RobotFileParser``
    built from the entry body, for every entry that lets "GoogleBot" fetch
    ``/`` while refusing "BensCoolBot".
    """
    for entry in entries:
        if entry.status_code == 200:
            parser = RobotFileParser(entry.url)
            parser.parse(entry.body.split("\n"))
            google_ok = parser.can_fetch("GoogleBot", "/")
            if google_ok and not parser.can_fetch("BensCoolBot", "/"):
                yield entry, parser
def _robot_parser(self, txt, url):
    """Check ``url`` against a robots.txt body for user-agent ``*``.

    :param txt: robots.txt content as bytes; decoded as ASCII with
        replacement of undecodable bytes
    :param url: URL to check
    :returns: whether url is allowed; ``True`` when ``txt`` is empty
    :rtype: bool
    """
    if not txt:
        return True
    rules = RobotFileParser()
    rules.parse(txt.decode("ascii", "replace").splitlines())
    return rules.can_fetch("*", url)
class Exclusion(object):
    """Answers robots.txt questions, keeping one parser per robots.txt URL."""

    def __init__(self):
        # robots.txt URL -> loaded RobotFileParser
        self.robot_cache = {}
        # Parser used by the most recent test_url() call.
        self.rp = RobotFileParser()

    # NOTE(review): the original author flagged this as possibly broken
    # ("disallows anything in the robots.txt, even if it is marked with
    # Allow:"). That report is unverified here; confirm against a known site.
    def test_url(self, url):
        """Return whether any crawler (``*``) may fetch ``url``.

        The robots.txt of the URL's domain is downloaded on first use and
        served from ``self.robot_cache`` afterwards. Previously the cache was
        never consulted and every call downloaded the file again.
        """
        robot_url = uu.domain_name(url) + '/robots.txt'
        parser = self.robot_cache.get(robot_url)
        if parser is None:
            parser = RobotFileParser()
            parser.set_url(robot_url)
            parser.read()
            self.robot_cache[robot_url] = parser
        self.rp = parser
        return parser.can_fetch('*', url)
def can_fetch(self, url):
    """Return whether any crawler (``*``) may fetch ``url``.

    One parser per host name is cached in ``self.__robots``. When robots.txt
    cannot be read, an allow-all stand-in is cached for that host instead.
    """
    parse = urllib.parse.urlparse(url)
    hostname = parse.hostname
    try:
        # Read from the same mapping that is written below. The look-up used
        # to go through a differently named attribute, so the cache never hit
        # and robots.txt was downloaded again for every URL.
        robot = self.__robots[hostname]
    except KeyError:
        roboturl = urllib.parse.urlunparse(
            (parse.scheme, parse.netloc, "robots.txt", "", "", ""))
        robot = RobotFileParser(roboturl)
        try:
            robot.read()
        except Exception:
            # Best effort: an unreachable robots.txt must not stop the crawl.
            robot = _RobotAllowAll()
        self.__robots[hostname] = robot
    return robot.can_fetch("*", url)
async def parse_robots(session, base):
    """Fetch and parse the robots.txt that sits next to ``base``.

    Returns a ``RobotFileParser``. When the response status is not 200 the
    body is ignored and the parser allows everything.
    """
    robots_url = urljoin(base, "robots.txt")
    async with session.get(robots_url) as response:
        status = response.status
        text = await response.text()

    parser = RobotFileParser()
    if status != 200:
        parser.allow_all = True
        return parser
    parser.parse(text.splitlines())
    return parser
class RobotsTxt:
    """Wraps the robots.txt of a site; everything is allowed if it is unavailable."""

    def __init__(self, base_url):
        """Load ``robots.txt`` relative to ``base_url``.

        ``self.rp`` is the loaded parser, or ``None`` when loading failed.
        """
        try:
            url = urljoin(base_url, 'robots.txt')
            self.rp = RobotFileParser()
            # set_url() belongs to the parser. Calling it on self raised
            # AttributeError, which was swallowed below, so self.rp was
            # always None and every URL was reported as fetchable.
            self.rp.set_url(url)
            self.rp.read()
        except Exception:
            # Best effort: no usable robots.txt means no restrictions.
            self.rp = None

    def canFetch(self, url):
        """Return whether any crawler (``*``) may fetch ``url``."""
        if self.rp is None:
            return True
        return self.rp.can_fetch('*', url)
def __init__(self, url: str, api_key: str = None, authorizer=None):
    """Store the connection settings; no network request is made here.

    :param url: site URL; trailing slashes are removed before it is parsed.
    :param api_key: API key; an empty string is stored when not given.
    :param authorizer: stored unchanged on ``self._authorizer``.
    """
    parsed_url = parse_url_from_string(url.rstrip("/"))
    self.host = parsed_url.host
    self.url = str(parsed_url)
    self.api_key = api_key or ''
    self._authorizer = authorizer

    # The parser is only pointed at robots.txt; it is not read here.
    self._ignore_robots = False
    self.robots = RobotFileParser(urljoin(self.url, 'robots.txt'))

    # Lazily-evaluated objects
    self._session = None
    self._taxonomies = None
    self._informationobjects = None
    self._virtual_taxonomies = None
    self._virtual_authorities = None
def add_robot(self, base_url):
    """Return the robots.txt parser for ``base_url``, downloading it once.

    Parsers are kept in ``self.robots`` keyed by ``base_url``. The file is
    only parsed when a raw response exists and its status is not 404.
    """
    # Consult the cache first; the download used to happen on every call,
    # even when the parser was already stored.
    if base_url in self.robots:
        return self.robots[base_url]

    resp = download(base_url, self.config, self.logger)
    robots_file = RobotFileParser()
    if resp.raw_response is not None and resp.status != 404:
        robots_file.parse(resp.raw_response.content.decode().split("\n"))
    # NOTE(review): a parser that never parsed anything answers False to
    # every can_fetch() call, so hosts without robots.txt end up fully
    # blocked unless callers handle that case; confirm this is intended.
    self.robots[base_url] = robots_file
    return robots_file
class RobotsTxt:
    """Lazily loaded robots.txt state for one domain.

    ``state`` is ``None`` until the first request, ``False`` when loading
    failed, and the loaded ``RobotFileParser`` otherwise.
    """

    def __init__(self) -> None:
        self.state = None  # type: Any

    def allowed(self, url: URL) -> bool:
        """Return whether ``url`` may be fetched according to robots.txt."""
        # Nothing is known about this domain yet: request robots.txt first.
        if self.state is None:
            self.request(url.link("/robots.txt"))

        # If robots.txt could not be obtained there is no way to know
        # whether it exists, so everything is allowed in that case.
        if self.state is False or self.state.allow_all:
            return True

        if not self.state.last_checked and self.state.disallow_all:
            return False

        # find entry
        entry = self._entry()
        return allowed(matched_rules(entry, url))

    def request(self, url: str) -> None:
        """ Perform robots.txt request """
        if self.state is not None:
            return
        try:
            self.state = RobotFileParser()
            self.state.set_url(url)
            self.state.read()
        except Exception:
            self.state = False

    # This is mostly logic transferred from robotparser.py, but it tries to
    # follow the 2019 extension of Google's Robots.txt protocol regarding
    # allowed and disallowed paths.
    # https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
    # https://tools.ietf.org/html/draft-koster-rep-04
    def _entry(self) -> Any:
        """Return the first entry that applies to ``user_agent``, else the default one."""
        applicable = (
            entry for entry in self.state.entries if entry.applies_to(user_agent)
        )
        return next(applicable, self.state.default_entry)
def __init__(self, main_page=None, robotrules=True, threadcount=1):
    """Check the arguments and prepare single or multithreaded sitemap generation.

    :param main_page: URL the crawl starts from; it seeds ``self.unvisited``.
    :param robotrules: ``True`` or the string ``"True"`` to honour robots.txt.
    :param threadcount: number of workers; converted with ``int()``.
    """
    logging.info("Consider Robot.txt ? ==> "+str(robotrules))
    self.robotrules = robotrules
    self.site_map = {}
    self.unvisited = set([])
    self.start_page = None
    self.robot_txt_rules = None

    if main_page:
        self.unvisited.add(main_page)
        try:
            self.start_page = urlparse(main_page).netloc
        except Exception:
            logging.error("Improper URL, Please provide a Valid Url:"+main_page)
            exit(0)

    # Accept the boolean default as well as the string "True". Comparing
    # only against the string meant the default value never loaded
    # robots.txt and its rules were silently ignored.
    if self.robotrules in (True, "True"):
        try:
            logging.info("robot.txt respected")
            self.robot_txt_rules = RobotFileParser()
            self.robot_txt_rules.set_url(main_page + "/robots.txt")
            self.robot_txt_rules.read()
        except Exception:
            logging.error("Unable to read the robot.txt file")
            self.robotrules = False  # error reading robot.txt, ignore it forever

    self.threadcount = int(threadcount)
def __init__(self, database, fetcher, analyzer, verbose=False):
    """Wire up the collaborators and start with an empty queue.

    :param database: stored on ``self.database``.
    :param fetcher: stored on ``self.fetcher``.
    :param analyzer: stored on ``self.analyzer``.
    :param verbose: stored on ``self.verbose``.
    """
    self.robot_parser = RobotFileParser()
    self.queue = set()
    self.verbose = verbose
    self.analyzer = analyzer
    self.fetcher = fetcher
    self.database = database
def rootFileParser():
    """Print whether ``*`` may fetch two sample jianshu.com URLs."""
    rp = RobotFileParser("http://www.jianshu.com/robots.txt")
    rp.read()
    sample_pages = (
        "https://www.jianshu.com/p/b67554025d7d",
        "http://www.jianshu.com/search?q=python&page=1&type=collections",
    )
    for page in sample_pages:
        print(rp.can_fetch("*", page))
def get_robots(self, url, download_handler):
    """Download and parse the robots.txt that governs ``url``.

    :param url: page URL; its robots.txt URL comes from ``_get_robots_url``.
    :param download_handler: passed through to ``_get_robots_content``.
    :returns: a ``RobotFileParser``, or ``None`` when no content was obtained.
    """
    # Local import: replaces cgi.parse_header, whose module was removed from
    # the standard library in Python 3.13.
    from email.message import Message

    robots_url = self._get_robots_url(url)
    robots_url_info, robots_url_content = self._get_robots_content(
        robots_url, download_handler)
    if robots_url_content is None:
        return None

    header = Message()
    header['content-type'] = robots_url_info['content_type']
    charset = header.get_param('charset')
    if not charset or not isinstance(charset, str):
        charset = 'utf-8'

    try:
        text = robots_url_content.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # Unknown charset label or bytes that do not match it: fall back to
        # a lenient UTF-8 decode instead of losing the whole file.
        text = robots_url_content.decode('utf-8', 'replace')

    rf_parser = RobotFileParser()
    rf_parser.parse(text.splitlines())
    return rf_parser
def robot_parse():
    """Print whether ``*`` may fetch two sample jianshu.com URLs."""
    rp = RobotFileParser('http://www.jianshu.com/robots.txt')
    rp.read()
    article_allowed = rp.can_fetch('*', 'http://www.jianshu.com/p/')
    print(article_allowed)
    search_allowed = rp.can_fetch(
        '*', "http://www.jianshu.com/search?q=python&page=1&type=collections")
    print(search_allowed)
def set_url():
    """Print whether ``*`` may fetch two sample bilibili.com URLs."""
    # Point the parser at the robots.txt link (same effect as set_url()).
    rp = RobotFileParser('http://www.bilibili.com/robots.txt')
    rp.read()
    # Use can_fetch() to decide whether each page may be crawled.
    sample_pages = (
        'http://www.bilibili.com/vide/BV15J411T7WQ',
        'http://www.bilibili.com/search?q=python&page=1&type=collections',
    )
    for page in sample_pages:
        print(rp.can_fetch('*', page))
def GetRobotsTxt(url):
    """Load the robots.txt at ``url`` and print the verdict for three sample pages.

    Each line printed is the ``can_fetch`` result for user agent ``*``.
    """
    rp = RobotFileParser(url)
    rp.read()
    sample_pages = (
        'https://book.douban.com/tag/?view=type&icn=index-sorttags-all',
        'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4',
        'https://book.douban.com/tag/%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91?start=40&type=S',
    )
    for page in sample_pages:
        print(rp.can_fetch('*', page))
def can_read(url):
    """Return whether any crawler (``*``) may fetch ``url``.

    Parsers are cached per domain in the module-level ``Permissions``
    mapping. ``False`` is returned when robots.txt cannot be read or when the
    check itself fails.
    """
    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin("http://" + domain, "robots.txt"))
        try:
            rp.read()
        except Exception:
            # Exception instead of a bare except: Ctrl-C and interpreter
            # shutdown are no longer swallowed while a download is blocking.
            return False
        Permissions[domain] = rp
    try:
        return Permissions[domain].can_fetch("*", url)
    except Exception:
        return False
def robotExclusion(link):
    """Return whether any crawler (``*``) may fetch ``link``.

    The rules come from ``/robots.txt`` on the host that serves ``link``.
    """
    rp = RobotFileParser()
    # The file is called "robots.txt". The misspelt "/robot.txt" requested
    # before is not the exclusion file, so the site's real rules were never
    # consulted.
    rp.set_url(urljoin(link, '/robots.txt'))
    rp.read()
    return rp.can_fetch("*", link)
def check_robots_txt(self, parsed_link):
    """Ask the site's robots.txt whether our user agent may visit a URL.

    :param parsed_link: parsed URL object exposing ``geturl()``
    :return: boolean, the ``can_fetch`` verdict for ``self.user_agent``
    """
    page_url = parsed_link.geturl()
    rp = RobotFileParser(urljoin(page_url, '/robots.txt'))
    rp.read()
    return rp.can_fetch(self.user_agent, page_url)
def is_scraping_allowed(self):
    """Check whether robots.txt forbids scraping.

    The check is not really complete, because only the "all jobs" URL is
    examined.

    :return: the ``can_fetch`` verdict for user agent ``*``
    """
    robots_url = urljoin(self.base_url, 'robots.txt')
    robot_parser = RobotFileParser(robots_url)
    robot_parser.read()
    all_jobs = urljoin(self.base_url, self.all_job_url)
    return robot_parser.can_fetch('*', all_jobs)
def can_fetch_url(robots_url, site_url, useragent="*"):
    """Decide from the robots.txt at ``robots_url`` whether a URL may be fetched.

    :param robots_url: robots.txt url
    :param site_url: url that is to be fetched
    :param useragent: user agent the rules are evaluated for
    :return: True, if fetching is allowed
    """
    rules = RobotFileParser(robots_url)
    rules.read()
    return rules.can_fetch(useragent, site_url)
class SiteMap():
    """Crawls a site and writes its sitemap XML plus a per-page assets JSON file."""

    def __init__(self, main_page=None, robotrules=True, threadcount=1):
        """Check the arguments and prepare single or multithreaded generation.

        :param main_page: URL the crawl starts from; it seeds ``self.unvisited``.
        :param robotrules: ``True`` or the string ``"True"`` to honour robots.txt.
        :param threadcount: number of workers; converted with ``int()``.
        """
        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}
        self.unvisited = set([])
        self.start_page = None
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except Exception:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        # Accept the boolean default as well as the string "True". Comparing
        # only against the string meant the default value never loaded
        # robots.txt and robot_allows() silently permitted everything.
        if self.robotrules in (True, "True"):
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except Exception:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

        self.threadcount = int(threadcount)

    def execute(self):
        """Run the crawl, single threaded when ``threadcount`` is 1 or less."""
        if self.threadcount <= 1:  # single threaded model chosen: avoid threading
            self.generate()
        else:
            self.start()  # speed up with multiple workers

    def start(self):
        """Create a pool of ``threadcount`` workers, spawn the crawl on it,
        wait for all spawned work to finish and write the reports.
        """
        self.pool = pool.Pool(self.threadcount)
        self.pool.spawn(self.generate_parallels)
        self.pool.join()
        self.generate_reports()

    def generate(self):
        """Crawl without a pool until no unvisited page is left, then write
        the sitemap and assets files.
        """
        while self.unvisited:
            self.crawl()
        self.generate_reports()

    def generate_reports(self):
        """Write ``<name>.xml`` (one ``<url>`` tag per key of ``site_map``) and
        ``<name>_assets.json``, where ``<name>`` is ``start_page`` with dots
        replaced by underscores.
        """
        header = """<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"> """
        footer = """\n</urlset>\n"""
        entry = "\t<url><loc>%s</loc></url>\n"

        xml = header + "".join(entry % (url,) for url in self.site_map) + footer

        name = self.start_page.replace(".", "_")
        self.create_file("%s.xml" % (name), xml)
        self.create_file("%s_assets.json" % (name),
                         json.dumps(self.site_map, indent=2, sort_keys=True))

    def generate_parallels(self):
        """Crawl one page, then keep spawning this method on the pool while
        unvisited pages remain and the pool is not full.
        """
        self.crawl()
        while len(self.unvisited) > 0 and not self.pool.full():
            self.pool.spawn(self.generate_parallels)

    def create_file(self, file, content):
        """writes the given content to the file"""
        # The context manager closes the file even when write() fails.
        with open(file, 'w') as f:
            f.write(content)

    def compose_url_from_href(self, page, href):
        """Compose a full URL from ``href`` found on ``page``.

        Relative references are attached to the host of ``page`` using the
        ``http`` scheme; an absolute ``http...`` href only loses a trailing
        slash.
        """
        url = urlparse(page)

        if href.startswith('/'):
            return "http://%s%s"%(url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s"%(url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s"%(url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href
        elif href.endswith('/'):
            return href[:-1]

        return href

    def get_out_going_links(self, page, html_body):
        """Extract the outgoing links of a page and queue unseen same-domain
        links for crawling.

        Links are lower-cased and stripped of fragment and query string.
        Not queued for crawling:
            - links that robots.txt disallows
            - links whose href is a javascript call or a mailto: address
            - links to archives, images and executables (.zip, .gz, .jpg, ...)
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href'].lower()
            href = self.compose_url_from_href(page, href)

            # clean the href so that it is a legitimate url without the
            # fragment part
            href = urldefrag(href)[0]
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]

            new_page = urlparse(href)

            # assumption: a link whose host ends with the start domain can be
            # crawled to make the sitemap complete
            if not str(new_page.netloc).endswith(self.start_page):
                # doesn't belong to domain: record it but never crawl it
                valid_links_for_this_page.append(href)
                continue

            if self.robot_allows(href) and \
                not href in self.site_map and \
                not href in self.unvisited and \
                not 'javascript:' in href and \
                not 'mailto:' in href:
                if not (href.endswith(".zip") or
                        href.endswith(".gz") or
                        href.endswith(".gzip") or
                        href.endswith(".tar") or
                        href.endswith(".bz2") or
                        href.endswith(".jpg") or
                        href.endswith(".png") or
                        href.endswith(".exe")):
                    self.unvisited.add(href)
                # NOTE(review): nesting of this append was reconstructed from
                # whitespace-collapsed source; confirm it belongs at this
                # level rather than one level in or out.
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def get_assets(self, page, headers, html_body):
        """Record date, links, stylesheets, images and scripts of ``page`` in
        ``site_map``.

        The date is the ``Last-Modified`` response header when present,
        otherwise the ``Date`` header.
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})
        js = soup.findAll('script')

        self.site_map[page] = {
            'date': date,
            'links': self.get_out_going_links(page, html_body),
            'css': [c['href'] for c in css],
            'img': [i['src'] for i in img],
            'js': [x.get('src', 'inline jscode') for x in js]
        }

    def crawl(self):
        """Take one unvisited page, open it and collect its assets.

        Pages already in ``site_map``, responses other than 200 and pages
        that raise while being processed are skipped.
        """
        if len(self.unvisited) <= 0:
            return
        page = self.unvisited.pop()
        if page in self.site_map:
            return
        logging.info("Starting to Crawl Page: " + page)

        try:
            response = self.access_page(page)
            if (response.status_code != 200):
                return None

            html_body = response.text
            self.get_assets(page, response.headers, html_body)
        except Exception:
            # Pass the page as a logging argument; concatenating it onto the
            # format string left a literal "%s" in the message.
            logging.error("Issue while opening url: %s", page)
            return None
        logging.debug("Crawled Pages: {}".format(len(self.site_map)))

    def access_page(self, url):
        """accesses the url from the server. This method was created
            to enable mock tests.
        """
        return requests.get(url)

    def get_site_map(self):
        """exposes site_map"""
        return self.site_map

    def set_start_page(self, url):
        """sets the start page for the crawler"""
        self.start_page = url

    def robot_allows(self, link):
        """method to check if link can be accessed as per robot rules"""
        if not self.robotrules:
            return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except Exception:
            # No usable rules (e.g. robots.txt was never loaded): allow.
            return True
def __init__(self, scheduler, id):
    """Initialise the thread and remember its scheduler and worker id.

    :param scheduler: stored on ``self.scheduler``.
    :param id: stored on ``self.id``.
    """
    Thread.__init__(self)
    self.id = id
    self.running = True
    self.robot_parser = RobotFileParser()
    self.scheduler = scheduler
# Copyright(C) 2018 刘珅珅
# Environment: python 3.6.4
# Date: 2018.9.8
# Test of the robots protocol
from urllib.robotparser import RobotFileParser

url = 'http://www.jianshu.com/robots.txt'
rp = RobotFileParser()
rp.set_url(url)
rp.read()
# In Scrapy, crawling pages that robots.txt forbids requires disabling its
# robots handling.
# Prints False: crawling this page is not allowed.
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
def robot_can_fetch(robots_txt_content, url):
    """Check the path of ``url`` against robots.txt text for ``USER_AGENT``.

    :param robots_txt_content: robots.txt content as a single string.
    :param url: URL to check; only its path component is tested.
    :returns: the ``can_fetch`` verdict.
    """
    rules = RobotFileParser()
    rules.parse(robots_txt_content.splitlines())
    target_path = urlparse(url).path
    return rules.can_fetch(USER_AGENT, target_path)
class Crawler():
    """Crawl one domain and print an XML sitemap of the pages found.

    The class depends on several module-level names that are defined outside
    this block: ``config`` (``xml_header``, ``xml_footer``,
    ``crawler_user_agent``), the helpers ``obtaindomain`` and ``outputprint``,
    and the globals ``first``, ``domainname`` and ``all_link`` that are passed
    to every crawl step.
    """

    # Variables
    # Class-level values are only defaults; __init__ gives every instance its
    # own mutable containers so that two crawlers never share state.
    parserobots = False
    output = None
    report = False

    config = None
    domain = ""

    exclude = []
    skipext = []
    drop = []

    debug = False

    tocrawl = set([])
    crawled = set([])
    excluded = set([])

    marked = {}

    # URLs whose path ends with one of these suffixes are listed in the
    # sitemap but never downloaded.
    not_parseable_ressources = (".epub", ".mobi", ".docx", ".doc", ".opf", ".7z", ".ibooks", ".cbr", ".avi", ".mkv", ".mp4", ".jpg", ".jpeg", ".png", ".gif" ,".pdf", ".iso", ".rar", ".tar", ".tgz", ".zip", ".dmg", ".exe")

    # TODO also search for window.location={.*?}
    linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"][^>]*?>')
    imageregex = re.compile (b'<img [^>]*src=[\'|"](.*?)[\'"].*?>')

    rp = None
    response_code = {}
    nb_url = 1  # Number of url.
    nb_rp = 0  # Number of url blocked by the robots.txt
    nb_exclude = 0  # Number of url excluded by extension or word

    output_file = None

    target_domain = ""
    scheme = ""

    def __init__(self, parserobots=False, output=None, report=False, domain="",
                 exclude=None, skipext=None, drop=None, debug=False,
                 verbose=False, images=False):
        """Configure the crawler and seed the queue with *domain*.

        :param parserobots: honour robots.txt when True.
        :param output: path of the sitemap file; falsy means ``output_file``
            stays ``None`` and ``print`` falls back to standard output.
        :param report: record failing URLs per HTTP status for ``make_report``.
        :param domain: start URL; its netloc restricts the crawl.
        :param exclude: substrings that exclude a URL (default: none).
        :param skipext: file extensions to skip (default: none).
        :param drop: regex patterns removed from every found link.
        :param debug: log at DEBUG level.
        :param verbose: log at INFO level (ignored when *debug* is set).
        :param images: also emit ``<image:image>`` entries.
        :raises ValueError: if *domain* cannot be parsed.
        :raises SystemExit: with code 255 if *output* cannot be opened.
        """
        self.parserobots = parserobots
        self.output = output
        self.report = report
        self.domain = domain
        # None instead of [] as default: a list default is created once and
        # would be shared by every Crawler built without these arguments.
        self.exclude = [] if exclude is None else exclude
        self.skipext = [] if skipext is None else skipext
        self.drop = [] if drop is None else drop
        self.debug = debug
        self.verbose = verbose
        self.images = images

        # Per-instance bookkeeping; without this, add()/[] = would mutate the
        # class-level containers shared by all instances.
        self.crawled = set()
        self.excluded = set()
        self.marked = {}
        self.response_code = {}

        if self.debug:
            log_level = logging.DEBUG
        elif self.verbose:
            log_level = logging.INFO
        else:
            log_level = logging.ERROR

        logging.basicConfig(level=log_level)

        self.tocrawl = set([self.clean_link(domain)])

        try:
            url_parsed = urlparse(domain)
            self.target_domain = url_parsed.netloc
            self.scheme = url_parsed.scheme
        except Exception as err:
            logging.error("Invalide domain")
            # Raising a bare string is itself a TypeError; raise a real
            # exception that carries the message.
            raise ValueError("Invalid domain") from err

        if self.output:
            try:
                self.output_file = open(self.output, 'w')
            except Exception:
                logging.error ("Output file not available.")
                raise SystemExit(255)

    def run(self):
        """Write the sitemap header, crawl until the queue is empty, write the footer."""
        print(config.xml_header, file=self.output_file)

        if self.parserobots:
            self.check_robots()

        logging.info("Start the crawling process")

        while len(self.tocrawl) != 0:
            self.__crawling(first, domainname, all_link)

        logging.info("Crawling has reached end of all found links")

        print (config.xml_footer, file=self.output_file)

    def __crawling(self, frst, dmname, all_links):
        """Process one URL from the queue.

        :param frst: when equal to 0, *dmname* is recomputed from the URL
            being crawled via ``obtaindomain``.
        :param dmname: domain name a page must match to be reported through
            ``outputprint``.
        :param all_links: list of links already reported; appended in place.
        :returns: always ``None``; ``run()`` drives the loop.
        """
        crawling = self.tocrawl.pop()
        if frst == 0:
            dmname = obtaindomain(crawling)

        url = urlparse(crawling)
        self.crawled.add(crawling)
        logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
        request = Request(crawling, headers={"User-Agent": config.crawler_user_agent})

        # Ignore resources listed in not_parseable_ressources.
        # This avoids downloading files such as pdf, etc.
        response = None
        if not url.path.endswith(self.not_parseable_ressources):
            try:
                response = urlopen(request)
            except Exception as e:
                if hasattr(e, 'code'):
                    self.response_code[e.code] = self.response_code.get(e.code, 0) + 1

                    # Track the failing URLs per status code for the report
                    if self.report:
                        self.marked.setdefault(e.code, []).append(crawling)

                logging.debug ("{1} ==> {0}".format(e, crawling))
                # run() already loops over the queue, so simply return
                # instead of recursing into the next URL.
                return None
        else:
            logging.debug("Ignore {0} content might be not parseable.".format(crawling))

        # When nothing was downloaded the URL is still added to the sitemap,
        # with an empty body and no modification date.
        msg = "".encode( )
        date = None

        # Read the response
        if response is not None:
            try:
                msg = response.read()
                status = response.getcode()
                self.response_code[status] = self.response_code.get(status, 0) + 1
            except Exception as e:
                logging.debug ("{1} ===> {0}".format(e, crawling))
                return None
            finally:
                response.close()

            # Get the last modify date
            if 'last-modified' in response.headers:
                raw_date = response.headers['Last-Modified']
            else:
                raw_date = response.headers['Date']

            try:
                date = datetime.strptime(raw_date, '%a, %d %b %Y %H:%M:%S %Z')
            except (TypeError, ValueError) as e:
                # A missing or malformed date header only costs the <lastmod>
                # element; the page itself is still listed and its links
                # followed.
                logging.debug ("{1} ===> {0}".format(e, crawling))
                date = None

        # Image sitemap enabled ?
        image_list = ""
        if self.images:
            # Search for images in the current page.
            for image_link in set(self.imageregex.findall(msg)):
                image_link = image_link.decode("utf-8", errors="ignore")

                # Ignore link starting with data:
                if image_link.startswith("data:"):
                    continue

                # If path start with // get the current url scheme
                if image_link.startswith("//"):
                    image_link = url.scheme + ":" + image_link
                # Append domain if not present
                elif not image_link.startswith(("http", "https")):
                    if not image_link.startswith("/"):
                        image_link = "/{0}".format(image_link)
                    image_link = "{0}{1}".format(self.domain.strip("/"), image_link.replace("./", "/"))

                # Ignore image if path is in the exclude_url list
                if not self.exclude_url(image_link):
                    continue

                # Ignore other domain images
                image_link_parsed = urlparse(image_link)
                if image_link_parsed.netloc != self.target_domain:
                    continue

                # Only list images that robots.txt allows
                if self.can_fetch(image_link):
                    logging.debug("Found image : {0}".format(image_link))
                    image_list = "{0}<image:image><image:loc>{1}</image:loc></image:image>".format(image_list, self.htmlspecialchars(image_link))

        # Last mod fetched ?
        lastmod = ""
        if date:
            lastmod = "<lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod>"

        escaped_url = self.htmlspecialchars(url.geturl())
        print ("<url><loc>"+escaped_url+"</loc>" + lastmod + image_list + "</url>", file=self.output_file)

        # Report the page through outputprint once, if it is on dmname.
        tempdom = obtaindomain(escaped_url)
        templink = escaped_url
        if tempdom == dmname:
            if templink not in all_links:
                outputprint(templink)
                all_links.append(templink)
        if self.output_file:
            self.output_file.flush()

        # Found links
        links = self.linkregex.findall(msg)
        for link in links:
            link = link.decode("utf-8", errors="ignore")
            link = self.clean_link(link)
            logging.debug("Found : {0}".format(link))

            if link.startswith('/'):
                link = url.scheme + '://' + url[1] + link
            elif link.startswith('#'):
                link = url.scheme + '://' + url[1] + url[2] + link
            elif link.startswith(("mailto", "tel")):
                continue
            elif not link.startswith(('http', "https")):
                link = url.scheme + '://' + url[1] + '/' + link

            # Remove the anchor part if needed
            if "#" in link:
                link = link[:link.index('#')]

            # Drop attributes if needed
            for toDrop in self.drop:
                link = re.sub(toDrop, '', link)

            # Parse the url to get domain and file extension
            parsed_link = urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if link in self.crawled:
                continue
            if link in self.tocrawl:
                continue
            if link in self.excluded:
                continue
            if domain_link != self.target_domain:
                continue
            if parsed_link.path in ["", "/"]:
                continue
            if "javascript" in link:
                continue
            if self.is_image(parsed_link.path):
                continue
            if parsed_link.path.startswith("data:"):
                continue

            # Count one more URL
            self.nb_url += 1

            # Check if the navigation is allowed by the robots.txt
            if not self.can_fetch(link):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check if the current file extension is allowed or not.
            if target_extension in self.skipext:
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check if the current url doesn't contain an excluded word
            if not self.exclude_url(link):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)

        return None

    def clean_link(self, link):
        """Return *link* with ``./`` and ``//`` in its path collapsed to ``/``."""
        parts = list(urlparse(link))
        parts[2] = parts[2].replace("./", "/")
        parts[2] = parts[2].replace("//", "/")
        return urlunparse(parts)

    def is_image(self, path):
        """Return True when the guessed MIME type of *path* is ``image/*``."""
        mt, me = mimetypes.guess_type(path)
        return mt is not None and mt.startswith("image/")

    def __continue_crawling(self):
        """Crawl one more URL if the queue is not empty.

        No longer used by ``__crawling`` (``run()`` owns the loop); kept so
        the class keeps the same set of methods.
        """
        if self.tocrawl:
            self.__crawling(first, domainname, all_link)

    def exclude_link(self, link):
        """Remember *link* as excluded so it is not considered again."""
        if link not in self.excluded:
            self.excluded.add(link)

    def check_robots(self):
        """Download and parse the site's robots.txt into ``self.rp``."""
        # The leading slash anchors the file at the host root even when
        # self.domain contains a path.
        robots_url = urljoin(self.domain, "/robots.txt")
        self.rp = RobotFileParser()
        self.rp.set_url(robots_url)
        self.rp.read()

    def can_fetch(self, link):
        """Return False only when robots parsing is on and robots.txt forbids *link*."""
        if not self.parserobots:
            return True
        try:
            if self.rp.can_fetch("*", link):
                return True
            logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
            return False
        except Exception:
            # On error continue!
            logging.debug ("Error during parsing robots.txt")
            return True

    def exclude_url(self, link):
        """Return False when *link* contains one of the excluded substrings.

        Despite the name, True means the link is kept.
        """
        for ex in self.exclude:
            if ex in link:
                return False
        return True

    def htmlspecialchars(self, text):
        """Escape ``&``, ``"``, ``<`` and ``>`` for inclusion in the XML output."""
        # "&" must be replaced first so the entities added below are not
        # escaped a second time.
        return text.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")

    def make_report(self):
        """Print crawl statistics to standard output."""
        print ("Number of found URL : {0}".format(self.nb_url))
        print ("Number of link crawled : {0}".format(len(self.crawled)))
        if self.parserobots:
            print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
        if self.skipext or self.exclude:
            print ("Number of link exclude : {0}".format(self.nb_exclude))

        for code in self.response_code:
            print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

        for code in self.marked:
            print ("Link with status {0}:".format(code))
            for uri in self.marked[code]:
                print ("\t- {0}".format(uri))
class _RobotAllowAll: def can_fetch(self, a,b): return True class Robot: def __init__(self): self.__robots = {} def can_fetch(self, url): parse = urllib.parse.urlparse(url) hostname = parse.hostname try: robot = self.robot[hostname] except Exception: roboturl = urllib.parse.urlunparse((parse.scheme,parse.netloc,"robots.txt","","","")) robot = RobotFileParser(roboturl) try: robot.read() except Exception: robot = _RobotAllowAll() self.__robots[hostname] = robot return robot.can_fetch("*", url) if __name__ == "__main__": r = RobotFileParser("http://www.letudiant.fr", {}) r.read() print(r.can_fetch("*", "http://www.letudiant.fr/"))
class Crawler():
    """Crawl one domain and print an XML sitemap of the pages found.

    Relies on a module-level ``config`` object (``xml_header``,
    ``xml_footer``, ``crawler_user_agent``) defined outside this block.
    """

    # Variables
    # Class-level values are only defaults; __init__ gives every instance its
    # own mutable containers so that two crawlers never share state.
    parserobots = False
    output = None
    report = False

    config = None
    domain = ""

    exclude = []
    skipext = []
    drop = []

    debug = False

    tocrawl = set([])
    crawled = set([])
    excluded = set([])

    # TODO also search for window.location={.*?}
    linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

    rp = None
    response_code = {}
    nb_url = 1  # Number of url.
    nb_rp = 0  # Number of url blocked by the robots.txt
    nb_exclude = 0  # Number of url excluded by extension or word

    output_file = None

    target_domain = ""

    def __init__(self, parserobots=False, output=None, report=False, domain="",
                 exclude=None, skipext=None, drop=None, debug=False):
        """Configure the crawler and seed the queue with *domain*.

        :param parserobots: honour robots.txt when True (see ``can_fetch``).
        :param output: path of the sitemap file; falsy means ``output_file``
            stays ``None`` and ``print`` falls back to standard output.
        :param report: stored on the instance; not read inside this class.
        :param domain: start URL; its netloc restricts the crawl.
        :param exclude: substrings that exclude a URL (default: none).
        :param skipext: file extensions to skip (default: none).
        :param drop: regex patterns removed from every found link.
        :param debug: enable DEBUG logging.
        :raises ValueError: if *domain* cannot be parsed.
        :raises SystemExit: with code 255 if *output* cannot be opened.
        """
        self.parserobots = parserobots
        self.output = output
        self.report = report
        self.domain = domain
        # None instead of [] as default: a list default is created once and
        # would be shared by every Crawler built without these arguments.
        self.exclude = [] if exclude is None else exclude
        self.skipext = [] if skipext is None else skipext
        self.drop = [] if drop is None else drop
        self.debug = debug

        # Per-instance bookkeeping; without this, add()/[] = would mutate the
        # class-level containers shared by all instances.
        self.crawled = set()
        self.excluded = set()
        self.response_code = {}

        if self.debug:
            logging.basicConfig(level=logging.DEBUG)

        self.tocrawl = set([domain])

        try:
            self.target_domain = urlparse(domain)[1]
        except Exception as err:
            # Raising a bare string is itself a TypeError; raise a real
            # exception that carries the message.
            raise ValueError("Invalid domain") from err

        if self.output:
            try:
                self.output_file = open(self.output, 'w')
            except Exception:
                logging.debug ("Output file not available.")
                raise SystemExit(255)

    def run(self):
        """Write the sitemap header, crawl until the queue is empty, write the footer."""
        print (config.xml_header, file=self.output_file)

        logging.debug("Start the crawling process")
        # NOTE(review): nothing here calls checkRobots(), so with
        # parserobots=True and no prior call self.rp stays None and
        # can_fetch() allows everything - confirm the caller loads it.
        self.__continue_crawling()
        logging.debug("Crawling as reach the end of all found link")

        print (config.xml_footer, file=self.output_file)

    def __crawling(self):
        """Fetch one queued URL, print its sitemap entry and queue its links.

        Processes exactly one URL and returns ``None``; the loop lives in
        ``__continue_crawling`` so the call stack no longer grows with the
        number of pages.
        """
        crawling = self.tocrawl.pop()

        url = urlparse(crawling)
        self.crawled.add(crawling)
        request = Request(crawling, headers={"User-Agent": config.crawler_user_agent})

        try:
            response = urlopen(request)
        except Exception as e:
            if hasattr(e, 'code'):
                self.response_code[e.code] = self.response_code.get(e.code, 0) + 1
            logging.debug ("{1} ==> {0}".format(e, crawling))
            return None

        # Read the response
        try:
            msg = response.read()
            status = response.getcode()
            self.response_code[status] = self.response_code.get(status, 0) + 1
            response.close()

            # Get the last modify date
            if 'last-modified' in response.headers:
                date = response.headers['Last-Modified']
            else:
                date = response.headers['Date']

            date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

        except Exception as e:
            # Pages that cannot be read or dated are skipped entirely.
            logging.debug ("{1} ===> {0}".format(e, crawling))
            return None

        print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S')+"</lastmod></url>", file=self.output_file)
        if self.output_file:
            self.output_file.flush()

        # Relative links inherit the scheme of the page they were found on
        # (previously hard-coded to http, which downgraded https sites).
        scheme = url.scheme or 'http'

        # Found links
        links = self.linkregex.findall(msg)
        for link in links:
            # A stray non-UTF-8 byte in one href must not abort the crawl.
            link = link.decode("utf-8", errors="ignore")
            #logging.debug("Found : {0}".format(link))
            if link.startswith('/'):
                link = scheme + '://' + url[1] + link
            elif link.startswith('#'):
                link = scheme + '://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = scheme + '://' + url[1] + '/' + link

            # Remove the anchor part if needed
            if "#" in link:
                link = link[:link.index('#')]

            # Drop attributes if needed
            for toDrop in self.drop:
                link = re.sub(toDrop, '', link)

            # Parse the url to get domain and file extension
            parsed_link = urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if link in self.crawled:
                continue
            if link in self.tocrawl:
                continue
            if link in self.excluded:
                continue
            if domain_link != self.target_domain:
                continue
            if "javascript" in link:
                continue

            # Count one more URL
            self.nb_url += 1

            # Check if the navigation is allowed by the robots.txt
            if not self.can_fetch(link):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check if the current file extension is allowed or not.
            if target_extension in self.skipext:
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check if the current url doesn't contain an excluded word
            if not self.exclude_url(link):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)

        return None

    def __continue_crawling(self):
        """Crawl queued URLs one after another until the queue is empty."""
        while self.tocrawl:
            self.__crawling()

    def exclude_link(self, link):
        """Remember *link* as excluded so it is not considered again."""
        if link not in self.excluded:
            self.excluded.add(link)

    def checkRobots(self):
        """Download and parse ``<domain>/robots.txt`` into ``self.rp``.

        Side effect: a trailing ``/`` is appended to ``self.domain`` when it
        is missing.
        """
        if not self.domain.endswith("/"):
            self.domain += "/"
        self.rp = RobotFileParser()
        self.rp.set_url(self.domain+"robots.txt")
        self.rp.read()

    def can_fetch(self, link):
        """Return False only when robots parsing is on and robots.txt forbids *link*."""
        if not self.parserobots:
            return True
        try:
            if self.rp.can_fetch("*", link):
                return True
            logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
            return False
        except Exception:
            # On error continue!
            logging.debug ("Error during parsing robots.txt")
            return True

    def exclude_url(self, link):
        """Return False when *link* contains one of the excluded substrings.

        Despite the name, True means the link is kept.
        """
        for ex in self.exclude:
            if ex in link:
                return False
        return True

    def make_report(self):
        """Print crawl statistics to standard output."""
        print ("Number of found URL : {0}".format(self.nb_url))
        print ("Number of link crawled : {0}".format(len(self.crawled)))
        if self.parserobots:
            print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
        if self.skipext or self.exclude:
            print ("Number of link exclude : {0}".format(self.nb_exclude))

        for code in self.response_code:
            print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
def check_robots(self):
    """Download and parse the site's robots.txt into ``self.rp``.

    Reads ``self.domain`` and replaces ``self.rp`` with a new
    ``RobotFileParser``.  Errors raised by ``RobotFileParser.read`` are not
    caught here.
    """
    # The leading slash anchors the path at the host root.  A relative
    # "robots.txt" is resolved against the path of self.domain, so a start
    # URL such as http://host/blog/index.html pointed at /blog/robots.txt,
    # which is not where crawlers are expected to look.
    robots_url = urljoin(self.domain, "/robots.txt")
    self.rp = RobotFileParser()
    self.rp.set_url(robots_url)
    self.rp.read()
def _get_robot_parser(self):
    """Return a ``RobotFileParser`` aimed at this site's robots.txt.

    The URL is assembled from ``self.protocol`` and ``self.domain``.  The
    file is not downloaded here; the parser is returned unread.
    """
    robots_url = "".join((self.protocol, "://", self.domain, "/robots.txt"))
    # Passing the URL to the constructor is the same as calling set_url().
    return RobotFileParser(robots_url)
class Crawler(object):
    """Crawl pages on one domain, storing the link triples found on each page.

    Collaborators are injected: ``database`` (``is_page_stored``,
    ``store_triples``), ``fetcher`` (``fetch(url)`` returning
    ``(status, body)``) and ``analyzer`` (``analyze(url, html)`` returning
    triples).
    """

    def __init__(self, database, fetcher, analyzer, verbose=False):
        self.database = database
        self.fetcher = fetcher
        self.analyzer = analyzer
        self.verbose = verbose
        # URLs waiting to be crawled; a set, so duplicates collapse
        self.queue = set()
        self.robot_parser = RobotFileParser()

    def crawl(self, url):
        """Begin recursively crawling pages starting from the given URL.

        :param url: Starting URL
        :returns: None
        """
        if self.database.is_page_stored(url):
            print("Page is already crawled. Use --flush to flush the database file.", file=sys.stderr)
        else:
            # Because crawling is restricted to pages on the same domain, the
            # robots.txt file can be loaded once at the beginning of the crawl
            self.load_robots_file(url)

            # Add the starting URL to the queue of pages to be crawled, and
            # then keep crawling while there are still URLs in the queue
            self.queue.add(url)
            while self.queue:
                self.crawl_one(self.queue.pop())

    def crawl_one(self, url):
        """Fetch a single page and analyze it for links.

        The found triples are stored in the database, and found links that
        should be crawled are added to the queue.

        :param url: The page to fetch and analyze
        :returns: None
        """
        if self.verbose:
            print(url, file=sys.stderr)

        status, html = self.fetcher.fetch(url)

        if status is None:
            # The status code will be None if retrieval failed
            print("Failed to get {}".format(url), file=sys.stderr)
        else:
            # Search for links and images in the page, and get them as triples
            # of (page URL, link type, link URL)
            triples = self.analyzer.analyze(url, html)

            self.database.store_triples(triples)

            # Any linked URLs that are eligible for crawling are added to the
            # pending crawl queue
            for page_url, link_type, link_url in triples:
                if self.should_crawl(page_url, link_type, link_url):
                    self.queue.add(link_url)

    def should_crawl(self, page_url, link_type, link_url):
        """Determine whether a URL should be crawled.

        :param page_url: The page the link came from.
        :param link_type: The type of link URL.
        :param link_url: The link URL to test.
        :returns: True if the link URL should be crawled, otherwise False.
        """
        # Only HTML pages should be crawled, not other media
        if link_type not in ('page', 'iframe'):
            return False

        # The link should be on the same domain as the page it's linked from
        if not self.have_same_domain(page_url, link_url):
            return False

        # Fetching the link URL should be permitted by robots.txt
        if not self.robot_parser.can_fetch('Cosmo', link_url):
            return False

        # The linked page should not have been crawled already
        if self.database.is_page_stored(link_url):
            return False

        return True

    def have_same_domain(self, url1, url2):
        """Test whether two URLs have the same hostname and port.

        :returns: True if they do, otherwise False
        """
        return urlparse(url1).netloc == urlparse(url2).netloc

    def load_robots_file(self, url):
        """Load the /robots.txt file for the given URL by reusing the scheme
        and authority parts.

        A fresh parser is installed on every call.  Status 401/403, or a
        failed retrieval (status ``None``), disallows everything; any other
        status of 400 or above allows everything; otherwise the body is
        parsed.

        :param url: The URL from which to take the scheme and authority parts.
        :returns: None
        """
        # Create a new URL with the same scheme, host and port, but with a
        # path of /robots.txt
        parsed = urlparse(url)
        robots_url = urlunparse((parsed.scheme, parsed.netloc, '/robots.txt', '', '', ''))

        # Start from a clean parser so that allow_all/disallow_all flags and
        # parsed rules from an earlier load cannot leak into this one.
        self.robot_parser = RobotFileParser()

        # Load the robots.txt file using the requests library, because we need
        # to specify the User-Agent header. I noticed on a CloudFlare-fronted
        # site that it returns a 403 for /robots.txt if the the user agent is
        # Python-urllib, but 200 if it's Cosmo.
        status, robots_file = self.fetcher.fetch(robots_url)

        if status is None:
            # Retrieval failed (the fetcher reports that as None).  Comparing
            # None with 400 would raise a TypeError; an unreachable
            # robots.txt is treated as "disallow all" instead.
            self.robot_parser.disallow_all = True
        elif status in (401, 403):
            self.robot_parser.disallow_all = True
        elif status >= 400:
            self.robot_parser.allow_all = True
        else:
            self.robot_parser.parse(robots_file.splitlines())