Example No. 1
    def test_generosity(self):
        robotstxt_incorrect = """
        Foo: Foobot
        Bar: /"""
        rp = Protego.parse(content=robotstxt_incorrect)
        self.assertTrue(rp.can_fetch('http://foo.bar/x/y', 'FooBot'))

        robotstxt_incorrect_accepted = """
        user-agent foobot
        disallow /
        user agent harry potter 
        disallow /horcrux
        request rate 1/10s 1820-1940
        """
        rp = Protego.parse(content=robotstxt_incorrect_accepted)
        self.assertFalse(rp.can_fetch('http://foo.bar/x/y', 'FooBot'))

        self.assertFalse(rp.can_fetch('http://foo.bar/horcrux',
                                      'harry potter'))
        self.assertTrue(rp.can_fetch('http://foo.bar/abc', 'harry potter'))
        req_rate = rp.request_rate('harry potter')
        self.assertTrue(req_rate.requests == 1)
        self.assertTrue(req_rate.seconds == 10)
        self.assertTrue(req_rate.start_time.hour == 18)
        self.assertTrue(req_rate.start_time.minute == 20)
        self.assertTrue(req_rate.end_time.hour == 19)
        self.assertTrue(req_rate.end_time.minute == 40)

        wildcards_in_user_agent = """
        user-agent: foo*bot
        disallow: /myprofile
        """
        rp = Protego.parse(content=wildcards_in_user_agent)
        self.assertFalse(rp.can_fetch('http://foo.bar/myprofile', 'foo*bot'))
        self.assertFalse(rp.can_fetch('http://foo.bar/myprofile', 'foobot'))
Example No. 2
    def test_comments(self):
        content = """
        # comment 1
        User-Agent: one # comment 2
        # comment 3
        User-Agent: two
        Disallow: /one-two-bot
        # Disallow: /commented
        # comment 4
        User-Agent: *
        Disallow: /default-ua
        """
        rp = Protego.parse(content=content)
        self.assertFalse(rp.can_fetch("https://site.local/one-two-bot", "one"))
        self.assertFalse(rp.can_fetch("https://site.local/one-two-bot", "two"))
        self.assertTrue(rp.can_fetch("https://site.local/commented", "one"))
        self.assertTrue(rp.can_fetch("https://site.local/commented", "two"))
        self.assertTrue(rp.can_fetch("https://site.local/default-ua", "one"))
        self.assertTrue(rp.can_fetch("https://site.local/default-ua", "two"))

        content = ("User-agent: FooBot\n"
                   "# Disallow: /\n"
                   "Disallow: /foo/quz#qux\n"
                   "Allow: /\n")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("http://foo.bar/foo/bar", "FooBot"))
        self.assertFalse(rp.can_fetch("http://foo.bar/foo/quz", "FooBot"))
Example No. 3
    def test_percentage_encoding(self):
        content = ("User-agent: FooBot\n"
                   "Disallow: /\n"
                   "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n")
        rp = Protego.parse(content=content)
        self.assertTrue(
            rp.can_fetch(
                "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par",
                "FooBot"))

        content = ("User-agent: FooBot\n"
                   "Disallow: /\n"
                   u"Allow: /foo/bar/ツ\n")
        rp = Protego.parse(content=content)
        self.assertTrue(
            rp.can_fetch("http://foo.bar/foo/bar/%E3%83%84", "FooBot"))
        self.assertTrue(rp.can_fetch(u"http://foo.bar/foo/bar/ツ", "FooBot"))

        content = ("User-agent: FooBot\n"
                   "Disallow: /\n"
                   "Allow: /foo/bar/%E3%83%84\n")
        rp = Protego.parse(content=content)
        self.assertTrue(
            rp.can_fetch("http://foo.bar/foo/bar/%E3%83%84", "FooBot"))
        self.assertTrue(rp.can_fetch(u"http://foo.bar/foo/bar/ツ", "FooBot"))

        content = ("User-agent: FooBot\n"
                   "Disallow: /\n"
                   "Allow: /foo/bar/%62%61%7A\n")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("http://foo.bar/foo/bar/baz", "FooBot"))
        self.assertTrue(
            rp.can_fetch("http://foo.bar/foo/bar/%62%61%7A", "FooBot"))
Example No. 4
    def test_url_case_sensitivity(self):
        content = ("user-agent: FooBot\n" "disallow: /x/\n")
        rp = Protego.parse(content=content)
        self.assertFalse(rp.can_fetch("http://foo.bar/x/y", "FooBot"))

        content = ("user-agent: FooBot\n" "disallow: /X/\n")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("http://foo.bar/x/y", "FooBot"))
Example No. 5
    def test_unicode_url_and_useragent(self):
        content = u"""
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        # taken from https://en.wikipedia.org/robots.txt
        Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
        Disallow: /wiki/Käyttäjä:
        Disallow: /wiki/Keskustelu_käyttäjästä:
        User-Agent: UnicödeBöt
        Disallow: /some/randome/page.html"""
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("https://site.local/", "*"))
        self.assertFalse(rp.can_fetch("https://site.local/admin/", "*"))
        self.assertFalse(rp.can_fetch("https://site.local/static/", "*"))
        self.assertTrue(
            rp.can_fetch("https://site.local/admin/", u"UnicödeBöt"))
        self.assertFalse(
            rp.can_fetch("https://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:",
                         "*"))
        self.assertFalse(
            rp.can_fetch(u"https://site.local/wiki/Käyttäjä:", "*"))
        self.assertFalse(
            rp.can_fetch(
                u"https://site.local/wiki/Keskustelu_k%C3%A4ytt%C3%A4j%C3%A4st%C3%A4:",
                "*"))
        self.assertFalse(
            rp.can_fetch(u"https://site.local/wiki/Keskustelu_käyttäjästä:",
                         "*"))
        self.assertTrue(
            rp.can_fetch("https://site.local/some/randome/page.html", "*"))
        self.assertFalse(
            rp.can_fetch("https://site.local/some/randome/page.html",
                         u"UnicödeBöt"))

        content = u"""
        # robots.txt for http://www.example.com/
        
        User-Agent: Jävla-Foobot
        Disallow: /
        
        User-Agent: \u041b\u044c\u0432\u0456\u0432-bot
        Disallow: /totalitarianism/
        
        """
        rp = Protego.parse(content=content)

        self.assertTrue(rp.can_fetch("/foo/bar.html", u"jävla-fanbot"))
        self.assertFalse(rp.can_fetch("/foo/bar.html", u"jävla-foobot"))
        self.assertTrue(rp.can_fetch("/", "foobot"))

        self.assertTrue(
            rp.can_fetch("/", u"Mozilla/5.0 (compatible; Львів-bot/1.1)"))
        self.assertFalse(
            rp.can_fetch("/totalitarianism/foo.htm",
                         u"Mozilla/5.0 (compatible; Львів-bot/1.1)"))
Example No. 6
    def test_unescaped_url(self):
        content = ("User-agent: * \n" "Disallow: / \n" "Allow: /a<d.html")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("https://www.site.local/a<d.html", "*"))
        self.assertTrue(rp.can_fetch("https://www.site.local/a%3cd.html", "*"))

        content = ("User-agent: * \n" "Disallow: / \n" "Allow: /a<*")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("https://www.site.local/a<d.html", "*"))
        self.assertTrue(rp.can_fetch("https://www.site.local/a%3cd.html", "*"))
Example No. 7
def test_no_exceptions(path_to_robotstxt):
    try:
        with open(join(test_data_directory, path_to_robotstxt), 'rb') as f:
            try:
                content = f.read().decode('utf-8')
            except UnicodeDecodeError:
                # Downloaded robots.txt is not valid UTF-8; skip it
                return
            Protego.parse(content=content)
    except Exception as e:
        assert False, "{}. Exception raised while parsing {}".format(e, join(path_to_robotstxt, 'robots.txt'))
Example No. 8
    def test_allowed(self):
        content = ("User-agent: * \n"
                   "Disallow: /disallowed \n"
                   "Allow: /allowed \n"
                   "Crawl-delay: 10")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("https://www.site.local/allowed", "*"))
        self.assertFalse(rp.can_fetch("https://www.site.local/disallowed",
                                      "*"))

        content = ("User-agent: * \n" "Disallow: /d \n" "Crawl-delay: 10")
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("https://www.site.local/abc/d", "*"))
        self.assertFalse(rp.can_fetch("https://www.site.local/disallowed",
                                      "*"))
Example No. 9
    def test_user_agent_grouping(self):
        content = """
        User-Agent: one
        User-Agent: two
        Disallow: /success
        User-Agent: *
        Disallow: /failure
        """
        rp = Protego.parse(content=content)
        self.assertFalse(rp.can_fetch("https://site.local/success", "one"))
        self.assertFalse(rp.can_fetch("https://site.local/success", "two"))
        self.assertTrue(rp.can_fetch("https://site.local/failure", "one"))
        self.assertTrue(rp.can_fetch("https://site.local/failure", "two"))

        content = (
            "allow: /foo/bar/\n"
            "\n"
            "user-agent: FooBot\n"
            "disallow: /\n"
            "allow: /x/\n"
            "user-agent: BarBot\n"
            "disallow: /\n"
            "allow: /y/\n"
            "\n"
            "\n"
            "allow: /w/\n"
            "user-agent: BazBot\n"
            "\n"
            "user-agent: FooBot\n"
            "allow: /z/\n"
            "disallow: /\n")
        url_w = "http://foo.bar/w/a"
        url_x = "http://foo.bar/x/b"
        url_y = "http://foo.bar/y/c"
        url_z = "http://foo.bar/z/d"
        url_foo = "http://foo.bar/foo/bar/"
        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch(url_x, "FooBot"))
        self.assertTrue(rp.can_fetch(url_z, "FooBot"))
        self.assertFalse(rp.can_fetch(url_y, "FooBot"))
        self.assertTrue(rp.can_fetch(url_y, "BarBot"))
        self.assertTrue(rp.can_fetch(url_w, "BarBot"))
        self.assertFalse(rp.can_fetch(url_z, "BarBot"))
        self.assertTrue(rp.can_fetch(url_z, "BazBot"))

        self.assertFalse(rp.can_fetch(url_foo, 'FooBot'))
        self.assertFalse(rp.can_fetch(url_foo, 'BarBot'))
        self.assertFalse(rp.can_fetch(url_foo, 'BazBot'))
Example No. 10
 def test_empty_response(self):
     """empty response should equal 'allow all'"""
     rp = Protego.parse(content='')
     self.assertTrue(rp.can_fetch("https://site.local/", "*"))
     self.assertTrue(rp.can_fetch("https://site.local/", "chrome"))
     self.assertTrue(rp.can_fetch("https://site.local/index.html", "*"))
     self.assertTrue(rp.can_fetch("https://site.local/disallowed", "*"))
Example No. 11
 def test_malformed_crawl_delay(self):
     content = ("User-agent: * \n"
                "Disallow: /disallowed \n"
                "Allow: /allowed \n"
                "Crawl-delay: random_word")
     rp = Protego.parse(content=content)
     self.assertTrue(rp.crawl_delay('*') is None)
Example No. 12
def test_path_matching(pattern, path, match):
    content = """
    User-Agent: *
    disallow: {}
    """.format(pattern)
    rp = Protego.parse(content)
    assert (not rp.can_fetch(path, '*')) == match
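
This parametrized test receives its cases from a decorator the excerpt omits. A hedged sketch of plausible wiring, with sample rows that follow standard robots.txt wildcard semantics (`match` is True when the path should be blocked):

# Hedged sketch of the assumed parametrization; the rows are illustrative.
import pytest

@pytest.mark.parametrize('pattern,path,match', [
    ('/fish', '/fish.html', True),               # plain prefix match
    ('/fish*', '/fishheads/yummy.html', True),   # '*' matches any run of characters
    ('/*.php$', '/filename.php', True),          # '$' anchors the end of the path
    ('/*.php$', '/filename.php5', False),        # anchored pattern does not match
])
def test_path_matching(pattern, path, match):
    ...  # body as in the example above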
Example No. 13
def test_record_precedence(rules, url, allowed):
    content = """
    User-Agent: *
    {}
    """.format(rules)
    rp = Protego.parse(content)
    assert rp.can_fetch(url, '*') == allowed
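
Likewise, a hedged sketch of the parametrization this test presumably expects; the sample rows reflect longest-match precedence, with `Allow` winning exact ties under the Google rules Protego follows:

# Hedged sketch of the assumed parametrization; the rows are illustrative.
import pytest

@pytest.mark.parametrize('rules,url,allowed', [
    ('Allow: /p\nDisallow: /', '/page', True),                 # longer rule wins
    ('Disallow: /page\nAllow: /', '/page', False),             # longer rule wins
    ('Allow: /folder\nDisallow: /folder', '/folder/x', True),  # tie -> Allow
])
def test_record_precedence(rules, url, allowed):
    ...  # body as in the example above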
Example No. 14
 def test_sitemaps_come_first(self):
     """Some websites have sitemaps before any robots directives"""
     content = ("Sitemap: https://www.foo.bar/sitmap.xml\n"
                "User-Agent: FootBot\n"
                "Disallow: /something")
     rp = Protego.parse(content=content)
     self.assertEquals(list(rp.sitemaps), ["https://www.foo.bar/sitmap.xml"])
Example No. 15
    def test_with_absolute_urls(self):
        content = ("user-agent: *\n"
                   "disallow: http://ms-web00.walkerplus.com/\n")

        rp = Protego.parse(content=content)
        self.assertTrue(rp.can_fetch("http://foo.bar/", "FooBot"))
        self.assertFalse(rp.can_fetch("http://foo.bar/http://ms-web00.walkerplus.com/", "FooBot"))
Example No. 16
 def test_no_sitemaps(self):
     content = ("User-agent: * \n"
                "Disallow: /disallowed \n"
                "Allow: /allowed \n"
                "Crawl-delay: 10")
     rp = Protego.parse(content=content)
     self.assertTrue(not list(rp.sitemaps))
Example No. 17
 def test_no_preferred_host(self):
     content = ("User-agent: * \n"
                "Disallow: /disallowed \n"
                "Allow: /allowed \n"
                "Crawl-delay: 10")
     rp = Protego.parse(content=content)
     self.assertTrue(rp.preferred_host is None)
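
The example above only covers the negative case. For contrast, a hedged sketch of the positive one: the non-standard `Host` directive (historically honored by Yandex) is what populates `preferred_host`:

# Hedged sketch of the positive case; the host value is illustrative.
from protego import Protego

rp = Protego.parse("User-agent: *\n"
                   "Disallow: /disallowed\n"
                   "Host: www.example.com\n")
assert rp.preferred_host == 'www.example.com'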
Example No. 18
 def test_1994rfc_example(self):
     """Test parser on examples form 1994 RFC."""
     content = """
     # robots.txt for http://www.example.com/
     User-agent: *
     Disallow: /cyberworld/map/ # This is an infinite virtual URL space
     Disallow: /tmp/ # these will soon disappear
     Disallow: /foo.html
     """
     rp = Protego.parse(content=content)
     user_agent = "CrunchyFrogBot"
     self.assertTrue(rp.can_fetch("/", user_agent))
     self.assertFalse(rp.can_fetch("/foo.html", user_agent))
     self.assertTrue(rp.can_fetch("/foo.htm", user_agent))
     self.assertTrue(rp.can_fetch("/foo.shtml", user_agent))
     self.assertFalse(rp.can_fetch("/foo.htmlx", user_agent))
     self.assertTrue(rp.can_fetch("/cyberworld/index.html", user_agent))
     self.assertFalse(rp.can_fetch("/tmp/foo.html", user_agent))
     # Since it is the caller's responsibility to make sure the host name
     # matches, the parser disallows foo.html regardless of what I pass for
     # host name and protocol.
     self.assertFalse(
         rp.can_fetch("http://example.com/foo.html", user_agent))
     self.assertFalse(
         rp.can_fetch("http://www.example.com/foo.html", user_agent))
     self.assertFalse(
         rp.can_fetch("http://www.example.org/foo.html", user_agent))
     self.assertFalse(
         rp.can_fetch("https://www.example.org/foo.html", user_agent))
     self.assertFalse(rp.can_fetch("ftp://example.net/foo.html",
                                   user_agent))
Example No. 19
    def test_url_parts(self):
        content = ("User-agent: * \n" "Disallow: /path;params?query \n")
        rp = Protego.parse(content=content)
        self.assertFalse(
            rp.can_fetch(
                # Userinfo, port and fragment must not affect matching.
                "http://user@foo.bar:10/path;params?query#fragment",
                "*"))

        content = ("User-agent: * \n" "Disallow: /? \n")
        rp = Protego.parse(content=content)
        self.assertFalse(rp.can_fetch("/?query", "*"))
        self.assertTrue(rp.can_fetch('/', '*'))

        content = ("User-agent: * \n" "Disallow: /; \n")
        rp = Protego.parse(content=content)
        self.assertFalse(rp.can_fetch("/;params", "*"))
        self.assertTrue(rp.can_fetch('/', '*'))
Example No. 20
    async def _launch_browser(self) -> None:
        self._pw = await async_playwright().start()
        pwOptions = self._settingsdict['PLAYWRIGHT_LAUNCH_OPTIONS']
        if not "".__eq__(self._settingsdict['PLAYWRIGHT_BROWSER_TYPE']):
            if self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] not in ['chromium', 'firefox', 'webkit']:
                raise RuntimeError(
                    'Invalid PLAYWRIGHT_BROWSER_TYPE configuration')

            if self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] == 'chromium':
                self._browser = await self._pw.chromium.launch(**pwOptions)
            elif self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] == 'firefox':
                self._browser = await self._pw.firefox.launch(**pwOptions)
            elif self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] == 'webkit':
                self._browser = await self._pw.webkit.launch(**pwOptions)
        # If no Cookies path is provided, storage state is still returned, but won't be saved to the disk
        self._context = await self._browser.new_context(user_agent=self._settingsdict['USER_AGENT'], storage_state=self._settingsdict['COOKIES_PATH'])

        self._context.set_default_navigation_timeout(
            self._settingsdict['PLAYWRIGHT_NAVIGATION_TIMEOUT'])

        blankPage = await self._context.new_page()
        getUA = await blankPage.evaluate('''() => {
          return navigator.userAgent
        }''')

        # Load robots.txt file
        response = await blankPage.goto(urlparse(self.base_url).scheme + '://' + urlparse(self.base_url).netloc + '/robots.txt')
        if response.ok:
            try:
                text = await blankPage.inner_text('pre')
            except TimeoutError as e:
                # Fall back to the full body text when robots.txt is not
                # rendered inside a <pre> element.
                self.crawllogger.warning(
                    'Could not read <pre> element: {}'.format(e))
                text = await blankPage.inner_text('body')
            text = text + self._settingsdict['CUSTOM_ROBOT']
            self._robotsTxt = Protego.parse(text)
        else:
            self._robotsTxt = Protego.parse("""
          User-agent: *
          Allow: /
          """)

        self.crawllogger.info(
            '[000] Starting browser with User Agent: {}'.format(getUA))
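
The method reads several keys from `self._settingsdict` that the excerpt never defines. A hedged sketch of the expected shape; the keys are taken verbatim from the code above, the values are illustrative assumptions:

# Hedged sketch of the settings dict this method expects; values are examples.
settings = {
    'PLAYWRIGHT_BROWSER_TYPE': 'chromium',           # or 'firefox' / 'webkit'
    'PLAYWRIGHT_LAUNCH_OPTIONS': {'headless': True},
    'PLAYWRIGHT_NAVIGATION_TIMEOUT': 30000,          # milliseconds
    'USER_AGENT': 'MyCrawler/1.0',
    'COOKIES_PATH': 'cookies.json',                  # may be None; state then isn't persisted
    'CUSTOM_ROBOT': '\nUser-agent: MyCrawler\nDisallow: /private/',
}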
Example No. 21
 def test_index_html_is_directory(self):
     content = ("User-Agent: *\n"
                "Allow: /allowed-slash/index.html\n"
                "Disallow: /\n")
     rp = Protego.parse(content=content)
     self.assertTrue(rp.can_fetch("http://foo.com/allowed-slash/", "footbot"))
     self.assertTrue(rp.can_fetch("http://foo.com/allowed-slash/index.html", "footbot"))
     self.assertFalse(rp.can_fetch("http://foo.com/allowed-slash/index.htm", "footbot"))
     self.assertFalse(rp.can_fetch("http://foo.com/anyother-url", "footbot"))
Example No. 22
 def test_empty_record_group(self):
     content = """
     User-Agent: harrybot
     Disallow: /
     User-Agent: testbot
     """
     rp = Protego.parse(content=content)
     self.assertTrue(rp.can_fetch("https://site.local/path1", "testbot"))
     self.assertTrue(rp.can_fetch("https://site.local/path2", "testbot"))
Example No. 23
 def test_special_symbols_dual_behaviour(self):
     '''Special symbols such as * and $ should also be treated as ordinary characters.'''
     content = ("user-agent: FooBot\n"
                "disallow: /x/abc$\n"
                "disallow: /x*x/abc\n")
     rp = Protego.parse(content=content)
     self.assertFalse(rp.can_fetch("http://foo.bar/x*x/abc", "FooBot"))
     self.assertFalse(rp.can_fetch("http://foo.bar/x/abc$", "FooBot"))
     self.assertFalse(rp.can_fetch("http://foo.bar/x/abc%24", "FooBot"))
Example No. 24
 def test_no_request_rate(self):
     content = """
     User-agent: one
     Request-rate: 1/10s
     User-agent: two
     Disallow: /
     """
     rp = Protego.parse(content=content)
     self.assertTrue(rp.request_rate('two') is None)
Example No. 25
 def test_crawl_delay(self):
     content = ("User-agent: * \n"
                "Disallow: /disallowed \n"
                "Allow: /allowed \n"
                "Crawl-delay: 10 \n"
                "User-agent: testbot\n"
                "Crawl-delay: 15 \n")
     rp = Protego.parse(content=content)
     self.assertTrue(rp.crawl_delay('*') == 10.0)
     self.assertTrue(rp.crawl_delay('testbot') == 15.0)
Example No. 26
    async def _launch_browser(self) -> None:
        self._pw = await async_playwright().start()
        pwOptions = self._settingsdict['PLAYWRIGHT_LAUNCH_OPTIONS']
        if not "".__eq__(self._settingsdict['PLAYWRIGHT_BROWSER_TYPE']):
            if self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] not in [
                    'chromium', 'firefox', 'webkit'
            ]:
                raise RuntimeError(
                    'Invalid PLAYWRIGHT_BROWSER_TYPE configuration')

            if self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] == 'chromium':
                self._browser = await self._pw.chromium.launch(**pwOptions)
            elif self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] == 'firefox':
                self._browser = await self._pw.firefox.launch(**pwOptions)
            elif self._settingsdict['PLAYWRIGHT_BROWSER_TYPE'] == 'webkit':
                self._browser = await self._pw.webkit.launch(**pwOptions)

        self._context = await self._browser.new_context()
        self._context.set_default_navigation_timeout(
            self._settingsdict['PLAYWRIGHT_NAVIGATION_TIMEOUT'])

        blankPage = await self._context.new_page()
        getUA = await blankPage.evaluate('''() => {
          return navigator.userAgent
        }''')

        # Load robots.txt file
        response = await blankPage.goto(
            urlparse(self.base_url).scheme + '://' +
            urlparse(self.base_url).netloc + '/robots.txt')
        if response.ok:
            text = await blankPage.inner_text('pre')
            self._robotsTxt = Protego.parse(text)
        else:
            self._robotsTxt = Protego.parse("""
          User-agent: *
          Allow: /
          """)

        self.crawllogger.info(
            '[000] Starting browser with User Agent: {}'.format(getUA))
Example No. 27
 def test_escaped_special_symbols(self):
     '''Percent-encoded special symbols should be treated as ordinary characters.'''
     content = ("user-agent: FooBot\n"
                "disallow: /x/abc%24\n"
                "disallow: /x%2Ax/abc\n")
     rp = Protego.parse(content=content)
     self.assertFalse(rp.can_fetch("http://foo.bar/x/abc$abc", "FooBot"))
     self.assertFalse(rp.can_fetch("http://foo.bar/x/abc$", "FooBot"))
     self.assertTrue(rp.can_fetch("http://foo.bar/x/abc", "FooBot"))
     self.assertFalse(rp.can_fetch("http://foo.bar/x*x/abc", "FooBot"))
     self.assertFalse(rp.can_fetch("http://foo.bar/x*x/abcdef", "FooBot"))
     self.assertTrue(rp.can_fetch("http://foo.bar/xabcx/abc", "FooBot"))
Example No. 28
def robotstxt_test(robotstxt_url, user_agents, urls):
    """Given a :attr:`robotstxt_url` check which of the :attr:`user_agents` is
    allowed to fetch which of the :attr:`urls`.

    All the combinations of :attr:`user_agents` and :attr:`urls` will be
    checked and the results returned in one DataFrame.

    >>> robotstxt_test('https://facebook.com/robots.txt',
    ...                user_agents=['*', 'Googlebot', 'Applebot'],
    ...                urls=['/', '/bbc', '/groups', '/hashtag/'])
                          robotstxt_url user_agent   url_path  can_fetch
    0   https://facebook.com/robots.txt          *          /      False
    1   https://facebook.com/robots.txt          *       /bbc      False
    2   https://facebook.com/robots.txt          *    /groups      False
    3   https://facebook.com/robots.txt          *  /hashtag/      False
    4   https://facebook.com/robots.txt   Applebot          /       True
    5   https://facebook.com/robots.txt   Applebot       /bbc       True
    6   https://facebook.com/robots.txt   Applebot    /groups       True
    7   https://facebook.com/robots.txt   Applebot  /hashtag/      False
    8   https://facebook.com/robots.txt  Googlebot          /       True
    9   https://facebook.com/robots.txt  Googlebot       /bbc       True
    10  https://facebook.com/robots.txt  Googlebot    /groups       True
    11  https://facebook.com/robots.txt  Googlebot  /hashtag/      False

    :param url robotstxt_url: The URL of the robots.txt file
    :param str,list user_agents: One or more user agents
    :param str,list urls: One or more paths (relative) or URLs (absolute) to
                           check
    :return DataFrame robotstxt_test_df:
    """
    if not robotstxt_url.endswith('/robots.txt'):
        raise ValueError('Please make sure you enter a valid robots.txt URL')
    if isinstance(user_agents, str):
        user_agents = [user_agents]
    if isinstance(urls, str):
        urls = [urls]
    robots_open = urlopen(Request(robotstxt_url, headers=headers))
    robots_bytes = robots_open.readlines()
    robots_text = ''.join(line.decode() for line in robots_bytes)
    rp = Protego.parse(robots_text)

    test_list = []
    for path, agent in product(urls, user_agents):
        d = dict()
        d['user_agent'] = agent
        d['url_path'] = path
        d['can_fetch'] = rp.can_fetch(path, agent)
        test_list.append(d)
    df = pd.DataFrame(test_list)
    df.insert(0, 'robotstxt_url', robotstxt_url)
    df = df.sort_values(['user_agent', 'url_path']).reset_index(drop=True)
    return df
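
The function relies on imports and a `headers` dict that sit outside the excerpt; a plausible reconstruction under those assumptions:

# Imports and module-level names the function above uses but the excerpt
# omits; the `headers` value is an assumption.
from itertools import product
from urllib.request import Request, urlopen

import pandas as pd
from protego import Protego

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed request headers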
Example No. 29
 def test_skip_malformed_line(self):
     content = """
     User-Agent: one
     Disallow: /bot-one
     Harry Potter
     User-Agent: two
     Disallow: /bot-two
     """
     rp = Protego.parse(content=content)
     self.assertFalse(rp.can_fetch("https://site.local/bot-one", "one"))
     self.assertTrue(rp.can_fetch("https://site.local/bot-two", "one"))
     self.assertFalse(rp.can_fetch("https://site.local/bot-two", "two"))
     self.assertTrue(rp.can_fetch("https://site.local/bot-one", "two"))
Example No. 30
 def test_skip_unknown_directives(self):
     content = """
     User-Agent: one
     Disallow: /bot-one
     Harry: Potter
     User-Agent: two
     Disallow: /bot-two
     """
     rp = Protego.parse(content=content)
     self.assertFalse(rp.can_fetch("https://site.local/bot-one", "one"))
     self.assertTrue(rp.can_fetch("https://site.local/bot-two", "one"))
     self.assertFalse(rp.can_fetch("https://site.local/bot-two", "two"))
     self.assertTrue(rp.can_fetch("https://site.local/bot-one", "two"))