Example #1
def analyze_badbots(db, filename, max_entries=100000, max_bots=100):
    useragents = defaultdict(int)

    for i, entry in enumerate(db):
        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))

        bans = [e for e in parser.entries if len(e.rulelines) == 1 and
                not e.rulelines[0].allowance and e.rulelines[0].path == '/']

        for ban in bans:
            for useragent in ban.useragents:
                useragents[useragent] += 1

        if i >= max_entries:
            break

    useragents = sorted(useragents.items(), key=lambda x: -x[1])
    with open(filename, "w") as output:
        output.write("useragent\tcount\ttype\tinfolink\tcompany\thomepage\n")
        for useragent, count in useragents[:max_bots]:
            agenttype, info, company, homepage = BOT_TYPES.get(useragent, ('', '', '', ''))
            if not info:
                info = DEFAULT_USERAGENT_URL % useragent
            output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (useragent, count, agenttype, info,
                                                       company, homepage))

    return useragents
Example #2
File: models.py Project: Towhidn/quaero
	def is_allowed(self):
		if self.site.robots_status==200:
			parser = RobotFileParser()
			lines = io.StringIO(self.site.robots).readlines()
			parser.parse(lines)
			return parser.can_fetch(settings.USER_AGENT, self.get_url())
		return True
Example #3
def get_robot_file_parser(start_url: str,
                          **kwargs) -> Union[RobotFileParser, None]:
    """Returns :class:`~python:urllib.robotparser.RobotFileParser` object from given URL.
    If no ``robots.txt`` file is found or error occurs, returns ``None``.

    :param start_url: URL from which ``robots.txt`` will be collected.
    :param kwargs: Will be passed to :func:`get_html`.

    .. seealso:: :func:`async_get_robot_file_parser`
    """
    try:
        parsed_url = ParsedUrl(start_url)

        robot_txt_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        rp = RobotFileParser(robot_txt_url)

        text = get_html(robot_txt_url,
                        check_http_content_type=False,
                        return_response_object=False,
                        raise_for_status=True,
                        **kwargs)

        lines = [line.strip() for line in text.split("\n") if line != '']
        rp.parse(lines)

        return rp
    except Exception as e:  # Exceptions from URL parsing, HTML retrieval and robot file parsing
        logging.warning(
            f"Unable to retrieve robots.txt from {start_url}. Reason: {e}")
        return None
Example #5
    def get_robots(self):
        rp = RobotFileParser()
        if self.robots_content:
            rp.parse(self.robots_content)
        else:
            rp.allow_all = True
        return rp
Example #6
    def _robots(self):
        robots = RobotFileParser()
        r = fetch_raw(self.url.site + 'robots.txt', strict=False)
        if r is None:
            robots.parse(self.DEFAULT_ROBOTS.splitlines())
        else:
            robots.parse(r.text.splitlines())
        return robots
Example #7
class Crawler(Thread):
	def __init__(self, scheduler, id):
		Thread.__init__(self)
		self.scheduler = scheduler
		self.robot_parser = RobotFileParser()
		self.running = True
		self.id = id

	def run(self):
		global RUNNING
		print(self.id,"running!")
		while RUNNING and self.running:
			url = self.scheduler.next()
			while url is None:
				sleep(2)
				url = self.scheduler.next()
				
			try:
				''' Check robots.txt '''
				parsed_url = urlparse(url)
				robots_url = parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt"
				robots_response = urlopen(robots_url, timeout=2)
				robots_file = robots_response.read()
				self.robot_parser.parse(robots_file.decode('utf-8').splitlines())
				if not self.robot_parser.can_fetch("*", url):
					#print(self.id,"is not allowed to fetch",url)
					continue
			
				''' Fetch the url '''
				print(self.id,"->",url)
				response = urlopen(url, timeout=2)
				data = response.read().decode('utf-8', errors='ignore')
				data = data.split('href="')
				del data[0]
				data = sorted(data, key=len)
			except HTTPError:
				data = []
			except URLError as e:
				print(url,"->",str(e))
				data = []
			except Exception:
				# TimeoutError, ConnectionResetError, UnicodeDecodeError, etc.
				data = []
				
			for d in data:
				d = d[:d.find('"')]
				if d.endswith('/'):
					d = d[:-1]
				''' Format relative URLs '''
				if not d.startswith("http://"):
					path = parsed_url.path
					url = path[:path.rfind("/")]
					while d.startswith("../"):
						url = url[:url.rfind("/")]
						d = d[3:]
					self.scheduler.add(parsed_url.scheme + \
										"://" + parsed_url.netloc + \
										url + "/" + d)
				else:
					self.scheduler.add(d)
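The crawler above resolves "../" segments by hand and treats anything that does not start with "http://" as relative. A minimal sketch of the same resolution using the standard urllib.parse.urljoin instead (the base URL and href strings below are illustrative, not taken from the example):

from urllib.parse import urljoin

# Resolve extracted hrefs against the page they were found on.
base = "http://example.com/a/b/page.html"  # hypothetical page URL
for href in ("../c/other.html", "/robots.txt", "http://example.com/d"):
    print(urljoin(base, href))
# -> http://example.com/a/c/other.html
# -> http://example.com/robots.txt
# -> http://example.com/d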
Example #8
def parse():  # use the parse() method to perform the read-and-analyze step
    rp = RobotFileParser()
    rp.parse(
        urlopen('http://www.bilibili.com/robots.txt').read().decode(
            'utf-8').split('\n'))
    print(rp.can_fetch('*', 'http://www.bilibili.com/vide/BV15J411T7WQ'))
    print(
        rp.can_fetch(
            '*',
            'http://www.bilibili.com/search?q=python&page=1&type=collections'))
Example #9
File: rule.py Project: xlybaby/VAR
class CrawlController(object):
    def __init__(self):
        self._rp = RobotFileParser()

    def allow(self, p_robots_uri, p_target_uri):
        http = urllib3.PoolManager()
        r = http.request('GET', p_robots_uri)
        if r.data:
            self._rp.parse(r.data.decode('utf-8').splitlines())
            return self._rp.can_fetch('*', p_target_uri)
        return True
Example #10
    def _robot_parser(self, txt, url):
        """Parses robots.txt with user-agent="*".
        :param txt: robots.txt to parse
        :param url: URL to check
        :returns: if url is allowed in robots.txt
        :rtype: bool
        """
        parser = RobotFileParser()
        if txt:
            parser.parse(txt.decode("ascii", "replace").splitlines())
            return parser.can_fetch("*", url)
        else:
            return True
Example #11
def filter_googlebot(entries):
    """ Given a bunch of robots.txt entries, figure out if googlebot is allowed
    but other random bots are banned. yields tuples of (entry, reppy.Robots) objects
    that match this condition"""
    for entry in entries:
        if entry.status_code != 200:
            continue

        parser = RobotFileParser(entry.url)
        parser.parse(entry.body.split("\n"))

        if parser.can_fetch("GoogleBot", "/") and not parser.can_fetch("BensCoolBot", "/"):
            yield entry, parser
Example #12
    def _get_robots(self, domain: Hyperlink) -> RobotFileParser:
        """get the robots.txt from any domain"""
        robots_url = domain.with_path("robots.txt")
        robots = RobotFileParser(str(robots_url))
        # try and get /robots.txt and parse except error we assume none
        try:
            resp = self._requester(robots_url, mime_types=("text/plain",))
            robots.parse(resp.text.splitlines())

        except (ClientError, ServerError, WrongMIMEType):
            robots.parse("")

        return robots
Example #14
async def parse_robots(session, base):
    """Fetches and parses the robots.txt file from a given base URL. Returns an instance of
    RobotFileParser."""

    url = urljoin(base, "robots.txt")
    async with session.get(url) as response:
        status = response.status
        text = await response.text()
    robot_parser = RobotFileParser()
    if status == 200:
        robot_parser.parse(text.splitlines())
    else:
        robot_parser.allow_all = True
    return robot_parser
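A usage sketch for the coroutine above, assuming the aiohttp library (the session type it expects) and an illustrative example.com base URL:

import asyncio
import aiohttp


async def main():
    async with aiohttp.ClientSession() as session:
        # parse_robots fetches <base>/robots.txt and returns a RobotFileParser
        rp = await parse_robots(session, "https://example.com/")
        print(rp.can_fetch("*", "https://example.com/some/page"))


asyncio.run(main())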
Example #15
    def add_robot(self, base_url):

        resp = download(base_url, self.config, self.logger)

        if resp.raw_response is not None:
            robot_list = resp.raw_response.content.decode().split("\n")

        # Adds the robots.txt in a global dictionary, returning the read robot.txt
        if base_url not in self.robots:
            robots_file = RobotFileParser()
            if resp.raw_response is not None and resp.status != 404:
                robots_file.parse(robot_list)
            self.robots[base_url] = robots_file

        return self.robots[base_url]
Example #16
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
Example #17
    def get_robots(self, url, download_handler):
        robots_url = self._get_robots_url(url)
        (robots_url_info, robots_url_content) = self._get_robots_content(
            robots_url, download_handler)

        if robots_url_content is None:
            return None

        content_type, ct_attrs = cgi.parse_header(
            robots_url_info['content_type'])
        charset = ct_attrs.get('charset', None)
        if charset is None or charset == '':
            charset = 'utf-8'

        rf_parser = RobotFileParser()
        rf_parser.parse(robots_url_content.decode(charset).splitlines())
        return rf_parser
Example #18
class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)
Example #19
    def can_crawl(self, url):
        robots_path = self.get_robots_path(url)
        parser = RobotFileParser()

        if self.cache.get(robots_path) is None:
            robots_content = self.read_robots(robots_path)
            self.cache[robots_path] = robots_content
        else:
            robots_content = self.cache.get(robots_path)

        if robots_content is None:
            return True

        if robots_content is False:
            return False

        parser.parse(robots_content)
        return parser.can_fetch("*", url)
Example #20
    def get_robots_parser(self, url: str):
        rp = RobotFileParser()
        if self.store.exists(url, 'txt'):
            body = self.store.load_url(url, 'txt')
        else:
            page, status_code = download_page(url, 'Robot')
            body = page.body
            if status_code in [401, 403]:
                body = self.DISALLOW_ALL
            elif 400 <= status_code < 500:  # including status_code 404
                body = self.ALLOW_ALL
            self.store.save_url(url, body, 'txt')
        if body.strip() == self.ALLOW_ALL:
            rp.allow_all = True
        elif body.strip() == self.DISALLOW_ALL:
            rp.disallow_all = True
        else:
            rp.parse(body.decode('utf-8').splitlines())
        return rp
Example #21
File: crawl.py Project: jbn/indie_crawl
def get_robots(es, domain):
    # Someone mentioned at Camp SF that they supported both http and https
    # for better coverage of older clients. This keeps that in mind.
    for protocol in ['https', 'http']:
        try:
            url = f"{protocol}://{domain}/robots.txt"
            doc = refresh_archive(es, url)[0]

            if doc['status_code'] == 200:
                lines = doc['content'].splitlines()
                robots = RobotFileParser(url=url)
                robots.parse(lines)
                return robots
            else:
                raise ValueError(f"Status code {doc['status_code']}")
        except KeyboardInterrupt:
            raise
        except Exception as e:
            LOGGER.warning(f"Unable to fetch {url}: {e}")

    return make_robots_allow_all()
Example #22
def checkRobot(uri):
    parsedUrl = urlparse(uri)
    if parsedUrl.scheme == "" or parsedUrl.netloc == "":
        return None, None
    robotsUrl = str(parsedUrl.scheme) + "://" + str(
        parsedUrl.netloc) + "/robots.txt"
    try:
        req = requests.get(url=robotsUrl)
    except requests.exceptions.SSLError:
        return True, "SSL error"
    except requests.exceptions.ConnectionError:
        return True, "Connection Error"
    except requests.exceptions.InvalidSchema:
        return True, f"Invalid schema: {robotsUrl}"
    if req.status_code > 400:
        # if robots.txt is not accessible, we are allowed
        return True, None
    rp = RobotFileParser()
    rp.set_url(robotsUrl)
    rp.parse(req.text.split("\n"))
    if rp.can_fetch(archivoConfig.archivo_agent, uri):
        return True, None
    else:
        return False, "Not allowed"
Example #23
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(
    urlopen('https://cuiqingcai.com/robots.txt').read().decode('utf-8').split(
        '\n'))
print(rp.can_fetch('*', 'https://cuiqingcai.com/1052.html'))
Example #24
from urllib.robotparser import RobotFileParser
from urllib.request  import urlopen

rp = RobotFileParser()
rp.parse(
    urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split(
        '\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
# from urllib.robotparser import  RobotFileParser
#
# rp = RobotFileParser()
# rp.set_url('http://www.jianshu.com/robots.txt')
# rp.read()
# print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
# print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))

Example #25
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(
    urlopen('https://blog.csdn.net/robots.txt').read().decode('utf-8').split(
        '\n'))
print(
    rp.can_fetch('*', 'https://blog.csdn.net/Linear_Luo/article/details/52231550'))
Example #26
# robots.txt example: the robots exclusion (crawler) protocol
# User-agent:*
# Disallow:/
# Allow:/public/
# User-agent:Baiduspider
# set_url(): sets the URL of the robots.txt file. If the URL was already passed when the RobotFileParser object was created, there is no need to call this method.
# read(): fetches robots.txt and analyzes it. Note that this method performs the actual read and analysis; if it is not called, every later check returns False, so be sure to call it. It returns nothing, but the read is performed.
# parse(): parses robots.txt content. It takes the lines of a robots.txt file and analyzes them according to the robots.txt syntax rules.
# can_fetch(): takes two arguments, a User-agent and the URL to be crawled, and returns True or False depending on whether that agent may fetch the URL.
# mtime(): returns the time robots.txt was last fetched and analyzed, which matters for long-running crawlers that need to re-check robots.txt periodically.
# modified(): also useful for long-running crawlers; it sets the current time as the time robots.txt was last fetched and analyzed.
# (A short sketch using set_url()/read() together with mtime()/modified() follows this example.)
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen
rp = RobotFileParser()
# rp.set_url('http://www.jianshu.com/robots.txt')
# rp.read()
# print(rp.can_fetch('*', 'https://www.jianshu.com/p/11046c89367d'))
# print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
print(
    '----------------------------------------------------------------------------------'
)
rp.parse(
    urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split(
        '\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch(
        '*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #27
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
# The URL can also be passed directly: rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.set_url('http://www.jianshu.com/robots.txt')
# Read the robots.txt file and analyze it. Note that this method performs the read and analysis; if it is not called, every later check returns False
rp.read()
# Use the can_fetch method to check whether a page may be crawled
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch(
        '*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
# Use the parse() method to perform the read and analysis.
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp_parse = RobotFileParser()
rp_parse.parse(
    urlopen('http://www.baidu.com/robots.txt').read().decode('utf-8').split(
        '\n'))
print(rp_parse.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp_parse.can_fetch(
        '*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
Example #28
def benchmark_python_parser(website):
    rp = RobotFileParser()
    rp.parse(website['robotstxt'].splitlines())
    for link in website['links']:
        rp.can_fetch('googlebot', link)
Example #29
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen


def rootFileParser():
    rp = RobotFileParser()
    rp.set_url("http://www.jianshu.com/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "https://www.jianshu.com/p/b67554025d7d"))
    print(
        rp.can_fetch(
            "*",
            "http://www.jianshu.com/search?q=python&page=1&type=collections"))


rootFileParser()

rp = RobotFileParser()
rp.parse(
    urlopen("http://www.jianshu.com/robots.txt").read().decode("utf-8").split(
        "\n"))
print(rp.can_fetch("*", "http://www.jianshu.com/p/b67554025d7d"))
print(
    rp.can_fetch(
        "*", "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #30
from urllib import request
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

url = 'http://www.jianshu.com/robots.txt'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}

req = request.Request(url=url, headers=headers, method='GET')
robot_txt = urlopen(req).read().decode('utf-8').split('\n')
rp = RobotFileParser()
rp.parse(robot_txt)
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch(
        '*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #31
File: 11 robots.py Project: joevers/python
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.set_url('https://www.jianshu.com/robots.txt')
# rp = RobotFileParser('https://www.jianshu.com/robots.txt')
rp.read()
print(rp.can_fetch('*','https://www.jianshu.com/p/b67554025d7d/'))
print(rp.can_fetch('*','https://www.jianshu.com'))

'''
rp = RobotFileParser()
rp.parse(urlopen('https://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))

print(rp.can_fetch('*','https://www.jianshu.com/p/b67554025d7d/'))
print(rp.can_fetch('*','https://www.jianshu.com'))
'''
Example #32
    def _get_robots_parser(additional_content: str = '') -> RobotFileParser:
        new_content = robots_content + additional_content
        robots = RobotFileParser()
        robots.parse(new_content.split('\n'))
        return robots
Example #33
def robot_can_fetch(robots_txt_content, url):
    parser = RobotFileParser()
    parser.parse(robots_txt_content.splitlines())
    return parser.can_fetch(USER_AGENT, urlparse(url).path)
Example #34
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen, Request
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

rp = RobotFileParser()
headers = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:67.0) Gecko/20100101 Firefox/67.0'
}
req = Request(url='https://www.jianshu.com/robots.txt', headers=headers)
rp.parse(urlopen(req).read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'https://www.jianshu.com/p/b67554025d7d'))
print(
    rp.can_fetch(
        '*',
        "https://www.jianshu.com/search?q=python&page=1&type=collections"))
Example #35
File: crawler.py Project: danellis/cosmo
class Crawler(object):
    def __init__(self, database, fetcher, analyzer, verbose=False):
        self.database = database
        self.fetcher = fetcher
        self.analyzer = analyzer
        self.verbose = verbose
        self.queue = set()
        self.robot_parser = RobotFileParser()

    def crawl(self, url):
        """Begin recursively crawling pages starting from the given URL.

        :param url: Starting URL
        :returns: None
        """
        if self.database.is_page_stored(url):
            print("Page is already crawled. Use --flush to flush the database file.", file=sys.stderr)
        else:
            # Because crawling is restricted to pages on the same domain, the
            # robots.txt file can be loaded once at the beginning of the crawl
            self.load_robots_file(url)

            # Add the starting URL to the queue of pages to be crawled, and
            # then keep crawling while there are still URLs in the queue
            self.queue.add(url)
            while len(self.queue) > 0:
                self.crawl_one(self.queue.pop())

    def crawl_one(self, url):
        """Fetch a single page and analyze it for links. The found triples are
        stored in the database, and found links that should be crawled are
        added to the queue.

        :param url: The page to fetch and analyze
        :returns: None
        """
        if self.verbose:
            print(url, file=sys.stderr)

        status, html = self.fetcher.fetch(url)

        if status is None:
            # The status code will be None if retrieval failed
            print("Failed to get {}".format(url), file=sys.stderr)
        else:
            # Search for links and images in the page, and get them as triples
            # of (page URL, link type, link URL)
            triples = self.analyzer.analyze(url, html)

            self.database.store_triples(triples)

            # Any linked URLs that are eligible for crawling are added to the
            # pending crawl queue
            for page_url, link_type, link_url in triples:
                if self.should_crawl(page_url, link_type, link_url):
                    self.queue.add(link_url)

    def should_crawl(self, page_url, link_type, link_url):
        """Determine whether a URL should be crawled.

        :param page_url: The page the link came from.
        :param link_type: The type of link URL.
        :param link_url: The link URL to test.
        :returns: True if the link URL should be crawled, otherwise False.
        """
        # Only HTML pages should be crawled, not other media
        if link_type not in ('page', 'iframe'):
            return False

        # The link should be on the same domain as the page it's linked from
        if not self.have_same_domain(page_url, link_url):
            return False

        # Fetching the link URL should be permitted by robots.txt
        if not self.robot_parser.can_fetch('Cosmo', link_url):
            return False

        # The linked page should not have been crawled already
        if self.database.is_page_stored(link_url):
            return False

        return True

    def have_same_domain(self, url1, url2):
        """Test whether two URLs have the same hostname and port.

        :returns: True if they do, otherwise False
        """
        return urlparse(url1).netloc == urlparse(url2).netloc

    def load_robots_file(self, url):
        """Load the /robots.txt file for the given URL by reusing the scheme
        and authority parts.

        :param url: The URL from which to take the scheme and authority parts.
        :returns: None
        """
        # Create a new URL with the same scheme, host and port, but with a
        # path of /robots.txt
        parsed = urlparse(url)
        robots_url = urlunparse((parsed.scheme, parsed.netloc, '/robots.txt', '', '', ''))

        # Load the robots.txt file using the requests library, because we need
        # to specify the User-Agent header. I noticed on a CloudFlare-fronted
        # site that it returns a 403 for /robots.txt if the user agent is
        # Python-urllib, but 200 if it's Cosmo.
        status, robots_file = self.fetcher.fetch(robots_url)
        if status in (401, 403):
            self.robot_parser.disallow_all = True
        elif status is None or status >= 400:
            # A failed fetch (status is None) is treated like a missing robots.txt
            self.robot_parser.allow_all = True
        else:
            self.robot_parser.parse(robots_file.splitlines())
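A standalone sketch of the fetch-with-an-explicit-User-Agent approach described in the load_robots_file comment above, assuming the requests library; the example.com URL is illustrative, and the "Cosmo" agent string is taken from the example:

import requests
from urllib.robotparser import RobotFileParser

robots_url = "https://example.com/robots.txt"  # hypothetical target
resp = requests.get(robots_url, headers={"User-Agent": "Cosmo"}, timeout=10)

rp = RobotFileParser(robots_url)
if resp.status_code in (401, 403):
    rp.disallow_all = True      # access denied: treat everything as disallowed
elif resp.status_code >= 400:
    rp.allow_all = True         # missing robots.txt: everything is allowed
else:
    rp.parse(resp.text.splitlines())

print(rp.can_fetch("Cosmo", "https://example.com/some/page"))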