def analyze_badbots(db, filename, max_entries=100000, max_bots=100):
    useragents = defaultdict(int)
    for i, entry in enumerate(db):
        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))
        # a "ban" is an entry whose single rule disallows everything
        bans = [e for e in parser.entries
                if len(e.rulelines) == 1
                and not e.rulelines[0].allowance
                and e.rulelines[0].path == '/']
        for ban in bans:
            for useragent in ban.useragents:
                useragents[useragent] += 1
        if i >= max_entries:
            break
    useragents = sorted(useragents.items(), key=lambda x: -x[1])
    with open(filename, "w") as output:
        output.write("useragent\tcount\ttype\tinfolink\tcompany\thomepage\n")
        for useragent, count in useragents[:max_bots]:
            agenttype, info, company, homepage = BOT_TYPES.get(useragent, ('', '', '', ''))
            if not info:
                info = DEFAULT_USERAGENT_URL % useragent
            output.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                         % (useragent, count, agenttype, info, company, homepage))
    return useragents
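# A hedged demonstration of the undocumented RobotFileParser internals that
# analyze_badbots relies on: entries, rulelines, allowance and path are
# CPython implementation details, stable in practice but not a public API.
from urllib.robotparser import RobotFileParser

p = RobotFileParser()
p.parse("User-agent: BadBot\nDisallow: /".splitlines())
entry = p.entries[0]
print(entry.useragents)              # ['BadBot']
print(entry.rulelines[0].allowance)  # False
print(entry.rulelines[0].path)       # '/'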
def is_allowed(self):
    if self.site.robots_status == 200:
        parser = RobotFileParser()
        lines = io.StringIO(self.site.robots).readlines()
        parser.parse(lines)
        return parser.can_fetch(settings.USER_AGENT, self.get_url())
    return True
def get_robot_file_parser(start_url: str, **kwargs) -> Union[RobotFileParser, None]:
    """Returns :class:`~python:urllib.robotparser.RobotFileParser` object from given URL.

    If no ``robots.txt`` file is found or an error occurs, returns ``None``.

    :param start_url: URL from which ``robots.txt`` will be collected.
    :param kwargs: Will be passed to :func:`get_html`.

    .. seealso:: :func:`async_get_robot_file_parser`
    """
    try:
        parsed_url = ParsedUrl(start_url)
        robot_txt_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        rp = RobotFileParser(robot_txt_url)
        text = get_html(robot_txt_url,
                        check_http_content_type=False,
                        return_response_object=False,
                        raise_for_status=True,
                        **kwargs)
        lines = [line.strip() for line in text.split("\n") if line != '']
        rp.parse(lines)
        return rp
    except Exception as e:
        # Exceptions from URL parsing, HTML retrieval and robots.txt parsing
        logging.warning(f"Unable to retrieve robots.txt from {start_url}. Reason: {e}")
        return None
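# A hedged usage sketch for get_robot_file_parser above; the page URL and
# user-agent string are illustrative, not from the original project.
rp = get_robot_file_parser("https://example.com/some/page")
if rp is None or rp.can_fetch("my-crawler", "https://example.com/some/page"):
    pass  # no usable robots.txt, or fetching this page is allowed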
def get_robots(self):
    rp = RobotFileParser()
    if self.robots_content:
        # parse() expects an iterable of lines; robots_content is assumed to
        # be the raw robots.txt text, so split it here (iterating a string
        # directly would yield characters, not lines)
        rp.parse(self.robots_content.splitlines())
    else:
        rp.allow_all = True
    return rp
def _robots(self):
    robots = RobotFileParser()
    r = fetch_raw(self.url.site + 'robots.txt', strict=False)
    if r is None:
        robots.parse(self.DEFAULT_ROBOTS.splitlines())
    else:
        robots.parse(r.text.splitlines())
    return robots
class Crawler(Thread):
    def __init__(self, scheduler, id):
        Thread.__init__(self)
        self.scheduler = scheduler
        self.robot_parser = RobotFileParser()
        self.running = True
        self.id = id

    def run(self):
        global RUNNING
        print(self.id, "running!")
        while RUNNING and self.running:
            url = self.scheduler.next()
            while url is None:
                sleep(2)
                url = self.scheduler.next()
            try:
                # Check robots.txt
                parsed_url = urlparse(url)
                robots_url = parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt"
                robots_response = urlopen(robots_url, timeout=2)
                robots_file = robots_response.read()
                self.robot_parser.parse(robots_file.decode('utf-8').splitlines())
                if not self.robot_parser.can_fetch("*", url):
                    # print(self.id, "is not allowed to fetch", url)
                    continue
                # Fetch the url
                print(self.id, "->", url)
                response = urlopen(url, timeout=2)
                data = response.read().decode('utf-8', errors='ignore')
                data = data.split('href="')
                del data[0]
                data = sorted(data, key=len)
            except HTTPError:
                data = []
            except URLError as e:
                # HTTPError (a URLError subclass) is caught above; this handler
                # must precede the catch-all below, or it would never run
                print(url, "->", str(e))
                data = []
            except Exception:
                # broad fallback covering TimeoutError, ConnectionResetError,
                # UnicodeDecodeError and anything else unexpected
                data = []
            for d in data:
                d = d[:d.find('"')]
                if d.endswith('/'):
                    d = d[:-1]
                # Format relative URLs (https links are absolute too)
                if not d.startswith(("http://", "https://")):
                    path = parsed_url.path
                    url = path[:path.rfind("/")]
                    while d.startswith("../"):
                        url = url[:url.rfind("/")]
                        d = d[3:]
                    self.scheduler.add(parsed_url.scheme + "://" +
                                       parsed_url.netloc + url + "/" + d)
                else:
                    self.scheduler.add(d)
def parse():
    # Use the parse() method to read and analyze robots.txt
    rp = RobotFileParser()
    rp.parse(
        urlopen('http://www.bilibili.com/robots.txt').read().decode(
            'utf-8').split('\n'))
    print(rp.can_fetch('*', 'http://www.bilibili.com/vide/BV15J411T7WQ'))
    print(
        rp.can_fetch(
            '*',
            'http://www.bilibili.com/search?q=python&page=1&type=collections'))
class CrawlController(object):
    def __init__(self):
        self._rp = RobotFileParser()

    def allow(self, p_robots_uri, p_target_uri):
        http = urllib3.PoolManager()
        r = http.request('GET', p_robots_uri)
        if r.data:
            self._rp.parse(r.data.decode('utf-8').splitlines())
            return self._rp.can_fetch('*', p_target_uri)
        return True
def _robot_parser(self, txt, url):
    """Parses robots.txt with user-agent="*".

    :param txt: robots.txt to parse
    :param url: URL to check
    :returns: if url is allowed in robots.txt
    :rtype: bool
    """
    parser = RobotFileParser()
    if txt:
        parser.parse(txt.decode("ascii", "replace").splitlines())
        return parser.can_fetch("*", url)
    else:
        return True
def filter_googlebot(entries):
    """Given a bunch of robots.txt entries, figure out if Googlebot is allowed
    but other random bots are banned.

    Yields tuples of (entry, RobotFileParser) objects that match this
    condition."""
    for entry in entries:
        if entry.status_code != 200:
            continue
        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))
        if parser.can_fetch("GoogleBot", "/") and not parser.can_fetch("BensCoolBot", "/"):
            yield entry, parser
def _get_robots(self, domain: Hyperlink) -> RobotFileParser:
    """get the robots.txt from any domain"""
    robots_url = domain.with_path("robots.txt")
    robots = RobotFileParser(str(robots_url))
    # try to get /robots.txt and parse it; on error, assume an empty file
    try:
        resp = self._requester(robots_url, mime_types=("text/plain",))
        robots.parse(resp.text.splitlines())
    except (ClientError, ServerError, WrongMIMEType):
        robots.parse("")
    return robots
async def parse_robots(session, base):
    """Fetches and parses the robots.txt file from a given base URL.

    Returns an instance of RobotFileParser."""
    url = urljoin(base, "robots.txt")
    async with session.get(url) as response:
        status = response.status
        text = await response.text()
    robot_parser = RobotFileParser()
    if status == 200:
        robot_parser.parse(text.splitlines())
    else:
        robot_parser.allow_all = True
    return robot_parser
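# A minimal usage sketch for parse_robots above, assuming the session is an
# aiohttp.ClientSession (its get()/status/text() API matches the code); the
# URLs are illustrative.
import asyncio
import aiohttp

async def demo():
    async with aiohttp.ClientSession() as session:
        rp = await parse_robots(session, "https://example.com/")
        print(rp.can_fetch("*", "https://example.com/private"))

asyncio.run(demo())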
def add_robot(self, base_url):
    resp = download(base_url, self.config, self.logger)
    if resp.raw_response is not None:
        robot_list = resp.raw_response.content.decode().split("\n")
    # Cache the parsed robots.txt in a shared dictionary keyed by base_url,
    # returning the cached parser on later calls
    if base_url not in self.robots:
        robots_file = RobotFileParser()
        if resp.raw_response is not None and resp.status != 404:
            robots_file.parse(robot_list)
        self.robots[base_url] = robots_file
    return self.robots[base_url]
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
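# Hedged usage of get_robotstxt_parser above; note that urlopen here is the
# project's own wrapper (it accepts a session and max_content_bytes), so this
# assumes that helper and MaxContentBytes are importable alongside it.
rp = get_robotstxt_parser("https://example.com/robots.txt")
print(rp.can_fetch("*", "https://example.com/comics/"))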
def get_robots(self, url, download_handler):
    robots_url = self._get_robots_url(url)
    (robots_url_info, robots_url_content) = self._get_robots_content(
        robots_url, download_handler)
    if robots_url_content is None:
        return None
    # Note: the cgi module is deprecated since Python 3.11 and removed in
    # 3.13; email.message.Message offers an equivalent header parser
    content_type, ct_attrs = cgi.parse_header(robots_url_info['content_type'])
    charset = ct_attrs.get('charset', None)
    if charset is None or charset == '':
        charset = 'utf-8'
    rf_parser = RobotFileParser()
    rf_parser.parse(robots_url_content.decode(charset).splitlines())
    return rf_parser
class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider,
                                          to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)
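# A minimal usage sketch for PythonRobotParser above; it assumes Scrapy's
# decode_robotstxt/to_unicode helpers are importable, and passes crawler=None,
# which from_crawler tolerates (spider is then None).
parser = PythonRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /admin\n")
print(parser.allowed("https://example.com/admin/users", "mybot"))  # False
print(parser.allowed("https://example.com/", "mybot"))             # True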
def can_crawl(self, url):
    robots_path = self.get_robots_path(url)
    parser = RobotFileParser()
    if self.cache.get(robots_path) is None:
        robots_content = self.read_robots(robots_path)
        self.cache[robots_path] = robots_content
    else:
        robots_content = self.cache.get(robots_path)
    if robots_content is None:
        return True
    if robots_content is False:
        return False
    parser.parse(robots_content)
    return parser.can_fetch("*", url)
def get_robots_parser(self, url: str):
    rp = RobotFileParser()
    if self.store.exists(url, 'txt'):
        body = self.store.load_url(url, 'txt')
    else:
        page, status_code = download_page(url, 'Robot')
        body = page.body
        if status_code in [401, 403]:
            body = self.DISALLOW_ALL
        elif 400 <= status_code < 500:  # including status_code 404
            body = self.ALLOW_ALL
        self.store.save_url(url, body, 'txt')
    if body.strip() == self.ALLOW_ALL:
        rp.allow_all = True
    elif body.strip() == self.DISALLOW_ALL:
        rp.disallow_all = True
    else:
        rp.parse(body.decode('utf-8').splitlines())
    return rp
def get_robots(es, domain):
    # Someone mentioned at Camp SF that they supported both http and https
    # for better coverage of older clients. This keeps that in mind.
    for protocol in ['https', 'http']:
        try:
            url = f"{protocol}://{domain}/robots.txt"
            doc = refresh_archive(es, url)[0]
            if doc['status_code'] == 200:
                lines = doc['content'].splitlines()
                robots = RobotFileParser(url=url)
                robots.parse(lines)
                return robots
            else:
                raise ValueError(f"Status code {doc['status_code']}")
        except KeyboardInterrupt:
            raise
        except Exception as e:
            LOGGER.warning(f"Unable to fetch {url}: {e}")
    return make_robots_allow_all()
def checkRobot(uri):
    parsedUrl = urlparse(uri)
    if parsedUrl.scheme == "" or parsedUrl.netloc == "":
        return None, None
    robotsUrl = str(parsedUrl.scheme) + "://" + str(parsedUrl.netloc) + "/robots.txt"
    try:
        req = requests.get(url=robotsUrl)
    except requests.exceptions.SSLError:
        return True, "SSL error"
    except requests.exceptions.ConnectionError:
        return True, "Connection Error"
    except requests.exceptions.InvalidSchema:
        return True, f"Invalid schema: {robotsUrl}"
    if req.status_code >= 400:
        # if robots.txt is not accessible, we are allowed
        return True, None
    rp = RobotFileParser()
    rp.set_url(robotsUrl)
    rp.parse(req.text.split("\n"))
    if rp.can_fetch(archivoConfig.archivo_agent, uri):
        return True, None
    else:
        return False, "Not allowed"
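# Hedged usage of checkRobot above; archivoConfig.archivo_agent is assumed to
# be defined by the surrounding project, and the URI is illustrative.
allowed, reason = checkRobot("https://example.com/ontology.owl")
if allowed is None:
    print("Malformed URI")
elif not allowed:
    print("Skipping:", reason)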
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(
    urlopen('https://cuiqingcai.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'https://cuiqingcai.com/1052.html'))
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
# parse() needs the robots.txt lines, not the urlopen function object itself
rp.parse(
    urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
# from urllib.robotparser import RobotFileParser
#
# rp = RobotFileParser()
# rp.set_url('http://www.jianshu.com/robots.txt')
# rp.read()
# print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
# print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))

from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(
    urlopen('https://blog.csdn.net/robots.txt').read().decode('utf-8').split('\n'))
print(
    rp.can_fetch('*', 'https://blog.csdn.net/Linear_Luo/article/details/52231550'))
# robots.txt example (the robots exclusion protocol for crawlers):
# User-agent: *
# Disallow: /
# Allow: /public/
# User-agent: Baiduspider
#
# set_url(): sets the URL of the robots.txt file. If the URL was already
#   passed when the RobotFileParser object was created, this call is unnecessary.
# read(): fetches robots.txt and analyzes it. Note that this method performs
#   the read-and-parse step; without it, every subsequent check returns False,
#   so be sure to call it. It returns nothing, but performs the read.
# parse(): parses robots.txt content; it takes lines of a robots.txt file and
#   analyzes them according to robots.txt syntax.
# can_fetch(): takes two arguments, a User-agent and a URL to crawl, and
#   returns True or False depending on whether that user agent may fetch the URL.
# mtime(): returns the time robots.txt was last fetched and analyzed; useful
#   for long-running crawlers that need to re-check robots.txt periodically.
# modified(): likewise useful for long-running crawlers; sets the last
#   fetched-and-analyzed time of robots.txt to the current time.
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
# rp.set_url('http://www.jianshu.com/robots.txt')
# rp.read()
# print(rp.can_fetch('*', 'https://www.jianshu.com/p/11046c89367d'))
# print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
print(
    '----------------------------------------------------------------------------------'
)
rp.parse(
    urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
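# A minimal sketch of the mtime()/modified() bookkeeping described in the
# comments above: modified() stamps the current time, mtime() returns that
# stamp, so a long-running crawler can decide when to re-fetch robots.txt.
# The one-day threshold is a hypothetical policy, not from the original.
import time

rp2 = RobotFileParser('http://www.jianshu.com/robots.txt')
rp2.read()      # fetch and parse robots.txt
rp2.modified()  # record "robots.txt was just (re)checked"
if time.time() - rp2.mtime() > 86400:
    rp2.read()  # re-fetch after a day
    rp2.modified()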
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
# The URL can also be passed directly: rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.set_url('http://www.jianshu.com/robots.txt')
# Read and analyze robots.txt. Note: this method performs the read-and-parse
# step; if it is not called, every subsequent check returns False.
rp.read()
# Use can_fetch() to check whether a page may be crawled
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))

# Use parse() to perform the read-and-analyze step instead.
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp_parse = RobotFileParser()
rp_parse.parse(
    urlopen('http://www.baidu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp_parse.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp_parse.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
def benchmark_python_parser(website):
    rp = RobotFileParser()
    rp.parse(website['robotstxt'].splitlines())
    for link in website['links']:
        rp.can_fetch('googlebot', link)
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen


def rootFileParser():
    rp = RobotFileParser()
    rp.set_url("http://www.jianshu.com/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "https://www.jianshu.com/p/b67554025d7d"))
    print(
        rp.can_fetch("*", "http://www.jianshu.com/search?q=python&page=1&type=collections"))


rootFileParser()

rp = RobotFileParser()
rp.parse(
    urlopen("http://www.jianshu.com/robots.txt").read().decode("utf-8").split("\n"))
print(rp.can_fetch("*", "http://www.jianshu.com/p/b67554025d7d"))
print(
    rp.can_fetch("*", "http://www.jianshu.com/search?q=python&page=1&type=collections"))
from urllib import request
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

url = 'http://www.jianshu.com/robots.txt'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
req = request.Request(url=url, headers=headers, method='GET')
robot_txt = urlopen(req).read().decode('utf-8').split('\n')

rp = RobotFileParser()
rp.parse(robot_txt)
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.set_url('https://www.jianshu.com/robots.txt')
# rp = RobotFileParser('https://www.jianshu.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://www.jianshu.com/p/b67554025d7d/'))
print(rp.can_fetch('*', 'https://www.jianshu.com'))
'''
rp = RobotFileParser()
rp.parse(urlopen('https://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'https://www.jianshu.com/p/b67554025d7d/'))
print(rp.can_fetch('*', 'https://www.jianshu.com'))
'''
def _get_robots_parser(additional_content: str = '') -> RobotFileParser:
    new_content = robots_content + additional_content
    robots = RobotFileParser()
    robots.parse(new_content.split('\n'))
    return robots
def robot_can_fetch(robots_txt_content, url):
    parser = RobotFileParser()
    parser.parse(robots_txt_content.splitlines())
    return parser.can_fetch(USER_AGENT, urlparse(url).path)
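# A hedged example for robot_can_fetch above; USER_AGENT comes from the
# surrounding module, and the rules/URLs here are illustrative. Passing only
# urlparse(url).path works because can_fetch accepts a bare path as well as a
# full URL.
ROBOTS = "User-agent: *\nDisallow: /private/\n"
print(robot_can_fetch(ROBOTS, "https://example.com/private/page"))  # False
print(robot_can_fetch(ROBOTS, "https://example.com/index.html"))    # True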
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen, Request
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

rp = RobotFileParser()
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:67.0) Gecko/20100101 Firefox/67.0'
}
req = Request(url='https://www.jianshu.com/robots.txt', headers=headers)
rp.parse(urlopen(req).read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'https://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch('*', "https://www.jianshu.com/search?q=python&page=1&type=collections"))
class Crawler(object):
    def __init__(self, database, fetcher, analyzer, verbose=False):
        self.database = database
        self.fetcher = fetcher
        self.analyzer = analyzer
        self.verbose = verbose
        self.queue = set()
        self.robot_parser = RobotFileParser()

    def crawl(self, url):
        """Begin recursively crawling pages starting from the given URL.

        :param url: Starting URL
        :returns: None
        """
        if self.database.is_page_stored(url):
            print("Page is already crawled. Use --flush to flush the database file.",
                  file=sys.stderr)
        else:
            # Because crawling is restricted to pages on the same domain, the
            # robots.txt file can be loaded once at the beginning of the crawl
            self.load_robots_file(url)
            # Add the starting URL to the queue of pages to be crawled, and
            # then keep crawling while there are still URLs in the queue
            self.queue.add(url)
            while len(self.queue) > 0:
                self.crawl_one(self.queue.pop())

    def crawl_one(self, url):
        """Fetch a single page and analyze it for links.

        The found triples are stored in the database, and found links that
        should be crawled are added to the queue.

        :param url: The page to fetch and analyze
        :returns: None
        """
        if self.verbose:
            print(url, file=sys.stderr)
        status, html = self.fetcher.fetch(url)
        if status is None:
            # The status code will be None if retrieval failed
            print("Failed to get {}".format(url), file=sys.stderr)
        else:
            # Search for links and images in the page, and get them as triples
            # of (page URL, link type, link URL)
            triples = self.analyzer.analyze(url, html)
            self.database.store_triples(triples)
            # Any linked URLs that are eligible for crawling are added to the
            # pending crawl queue
            for page_url, link_type, link_url in triples:
                if self.should_crawl(page_url, link_type, link_url):
                    self.queue.add(link_url)

    def should_crawl(self, page_url, link_type, link_url):
        """Determine whether a URL should be crawled.

        :param page_url: The page the link came from.
        :param link_type: The type of link URL.
        :param link_url: The link URL to test.
        :returns: True if the link URL should be crawled, otherwise False.
        """
        # Only HTML pages should be crawled, not other media
        if link_type not in ('page', 'iframe'):
            return False
        # The link should be on the same domain as the page it's linked from
        if not self.have_same_domain(page_url, link_url):
            return False
        # Fetching the link URL should be permitted by robots.txt
        if not self.robot_parser.can_fetch('Cosmo', link_url):
            return False
        # The linked page should not have been crawled already
        if self.database.is_page_stored(link_url):
            return False
        return True

    def have_same_domain(self, url1, url2):
        """Test whether two URLs have the same hostname and port.

        :returns: True if they do, otherwise False
        """
        return urlparse(url1).netloc == urlparse(url2).netloc

    def load_robots_file(self, url):
        """Load the /robots.txt file for the given URL by reusing the scheme
        and authority parts.

        :param url: The URL from which to take the scheme and authority parts.
        :returns: None
        """
        # Create a new URL with the same scheme, host and port, but with a
        # path of /robots.txt
        parsed = urlparse(url)
        robots_url = urlunparse((parsed.scheme, parsed.netloc, '/robots.txt',
                                 '', '', ''))
        # Load the robots.txt file using the requests library, because we need
        # to specify the User-Agent header. I noticed on a CloudFlare-fronted
        # site that it returns a 403 for /robots.txt if the user agent is
        # Python-urllib, but 200 if it's Cosmo.
        status, robots_file = self.fetcher.fetch(robots_url)
        if status in (401, 403):
            self.robot_parser.disallow_all = True
        elif status >= 400:
            self.robot_parser.allow_all = True
        else:
            self.robot_parser.parse(robots_file.splitlines())