Example #1
def test_cocrawler_reppy():
    r1 = Robots.parse(
        'http://example.com/robots.txt', '''
User-Agent: foo
Allow: /
# comment
Disallow: /
Disallow: /disallowed
''')
    r2 = Robots.parse(
        'http://example.com/robots.txt', '''
User-Agent: foo
Allow: /

Disallow: /
Disallow: /disallowed
''')
    r3 = Robots.parse(
        '', '''
User-Agent: foo
Allow: /

Disallow: /
Disallow: /disallowed
''')

    # despite the comment (r1) or blank line (r2, r3), 'foo' is disallowed from /disallowed
    assert r1.allowed('/', 'foo') is True
    assert r1.allowed('/disallowed', 'foo') is False
    assert r2.allowed('/', 'foo') is True
    assert r2.allowed('/disallowed', 'foo') is False
    assert r3.allowed('/', 'foo') is True
    assert r3.allowed('/disallowed', 'foo') is False

    # blank line does not reset user-agent to *, so bar has no rules
    assert r1.allowed('/', 'bar') is True
    assert r1.allowed('/disallowed', 'bar') is True
    assert r2.allowed('/', 'bar') is True
    assert r2.allowed('/disallowed', 'bar') is True
    assert r3.allowed('/', 'bar') is True
    assert r3.allowed('/disallowed', 'bar') is True

    # no substring matching weirdness, so 'foobar' does not match the 'foo' rules
    assert r1.allowed('/', 'foobar') is True
    assert r1.allowed('/disallowed', 'foobar') is True
    assert r2.allowed('/', 'foobar') is True
    assert r2.allowed('/disallowed', 'foobar') is True
    assert r3.allowed('/', 'foobar') is True
    assert r3.allowed('/disallowed', 'foobar') is True
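
For comparison, the same rules can also be queried through reppy's agent-level API, which Example #6 below relies on. A minimal self-contained sketch using the same ruleset as r1 above:

from reppy.robots import Robots

# the same rules as r1, queried through a resolved Agent so the
# user-agent lookup happens once rather than on every allowed() call
robots = Robots.parse('http://example.com/robots.txt',
                      'User-Agent: foo\nAllow: /\n# comment\nDisallow: /\nDisallow: /disallowed\n')
foo = robots.agent('foo')
assert foo.allowed('/')
assert not foo.allowed('/disallowed')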
Example #2
def test_cocrawler_reppy_xfail():
    r4 = Robots.parse('', '''
User-agent: *
Disallow: //
''')

    # ibm.com, I'm looking at you
    assert r4.allowed('/foo', '') is True
    assert r4.allowed('/', '') is True
Example #3
def getRobotParser(loader, startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page = loader.get(robotUrl, allow_redirects=True)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp
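
A hedged usage sketch for the helper above, assuming a loader object whose get(url, allow_redirects=...) call returns the robots.txt body as text (the user agent and path below are placeholders, not part of the snippet):

rp = getRobotParser(loader, 'http://example.com/')
if rp is not None and rp.allowed('/some/page', 'mybot'):
    # 'mybot' and '/some/page' are illustrative only
    print('allowed to crawl')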
Example #4
def getRobotParser(startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page, _, _ = getPage(robotUrl)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp
Example #5
 def __init__(self, robotstxt_body, spider):
     from reppy.robots import Robots
     self.spider = spider
     self.rp = Robots.parse('', robotstxt_body)
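
The wrapper stores the parsed rules on self.rp; a hedged sketch of the allowed() check such a wrapper typically pairs with it (the method name and signature follow Scrapy's robots.txt parser interface and are an assumption, not shown in the snippet):

 def allowed(self, url, user_agent):
     # delegate the per-URL decision to the parsed ruleset
     return self.rp.allowed(url, user_agent)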
Example #6
 def parse(self, content, name):
     '''Parse the robots.txt in content and return the agent of the provided name.'''
     return Robots.parse('http://example.com', content).agent(name)
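
A minimal usage sketch of what the returned agent supports (the robots.txt body and bot name here are made up for illustration):

from reppy.robots import Robots

agent = Robots.parse('http://example.com',
                     'User-agent: my-bot\nDisallow: /private\n').agent('my-bot')
print(agent.allowed('/public/page'))    # True
print(agent.allowed('/private/page'))   # False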
Example #7
def benchmark_reppy_parser(website):
    from reppy.robots import Robots
    rp = Robots.parse('', website['robotstxt'])
    for link in website['links']:
        rp.allowed(link, 'googlebot')
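
A hedged usage sketch: benchmark_reppy_parser expects a dict with 'robotstxt' and 'links' keys, as used in the function body above (the values below are made up):

website = {
    'robotstxt': 'User-agent: *\nDisallow: /private\n',
    'links': ['/index.html', '/private/data.html', '/about.html'],
}
benchmark_reppy_parser(website)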
Example #8
    def parse_robots_txt(self, link_list):
        host, port = self.config.cache_server
        robotsURL = ''
        robots = None
        links = []
        for link_url in link_list:
            parsed_link = parse.urlparse(link_url)
            link_base = '{0.scheme}://{0.netloc}/'.format(parsed_link)
            if robots is None or link_base not in robotsURL:
                if 'today.uci.edu' in link_base:
                    robots = Robots.parse('https://today.uci.edu/department/information_computer_sciences/robots.txt', '''
                    User-agent: *
                    Disallow: /*/calendar/*?*types*
                    Disallow: /*/browse*?*types*
                    Disallow: /*/calendar/200*
                    Disallow: /*/calendar/2015*
                    Disallow: /*/calendar/2016*
                    Disallow: /*/calendar/2017*
                    Disallow: /*/calendar/2018*
                    Disallow: /*/calendar/2019*
                    Disallow: /*/calendar/202*
                    Disallow: /*/calendar/week
                    
                    Disallow: /*/search
                    Disallow: /*?utm
                    
                    Allow: /
                    Allow: /*/search/events.ics
                    Allow: /*/search/events.xml
                    Allow: /*/calendar/ics
                    Allow: /*/calendar/xml
                    ''')
                else:
                    robotsURL = link_base + 'robots.txt'
                    time.sleep(0.5)
                    # get the robots.txt file
                    try:
                        robots = Robots.fetch(f"http://{host}:{port}/", params=[("q", f"{robotsURL}"), ("u", f"{self.config.user_agent}")], timeout=20)
                    except Exception as e:
                        print(e)
                        robots = None

                    # WARNING: UNCOMMENTING BYPASSES CACHE

                    # if the robots is empty, get the robots.txt from actual server
                    # robots_str = str(robots)
                    # robots_str = robots_str.split(': ')[1].split('}')[0]
                    # if robots_str == '[]':
                    #     robots = Robots.fetch(robotsURL, timeout=20)
                    #     print(robots)
            if robots is None:
                links.append(link_url)
                continue
            if parsed_link.params == '':
                if parsed_link.query == '':
                    query_only = '{0.path}/'.format(parsed_link)
                else:
                    query_only = '{0.path}/?{0.query}'.format(parsed_link)
            else:
                if parsed_link.query == '':
                    query_only = '{0.path}/{0.params}/'.format(parsed_link)
                else:
                    query_only = '{0.path}/{0.params}/?{0.query}'.format(parsed_link)
            if robots.allowed(query_only, self.config.user_agent):
                links.append(link_url)
        return links
Example #9
Allow: /serv
Allow: /~mak
Disallow: /
'''


@contextmanager
def timer(name, count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print(name)
        print('=' * 10)
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
        print('')


with timer('Parse', 100000) as count:
    for _ in xrange(count):
        Robots.parse('http://example.com/robots.txt', content)

parsed = Robots.parse('http://example.com/robots.txt', content)
with timer('Evaluate', 100000) as count:
    for _ in xrange(count):
        parsed.allowed('/org/example.html', 'other-bot')
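
The snippet above comes from a Python 2 benchmark (xrange) and relies on a content string whose beginning is truncated here. A self-contained Python 3 sketch of the same parse/evaluate timing, with a made-up robots.txt body standing in for content:

import time
from reppy.robots import Robots

# stand-in for the truncated `content` string above
content = 'User-agent: *\nAllow: /serv\nAllow: /~mak\nDisallow: /\n'

start = time.perf_counter()
for _ in range(100000):
    Robots.parse('http://example.com/robots.txt', content)
print('Parse: %d calls in %.3fs' % (100000, time.perf_counter() - start))

parsed = Robots.parse('http://example.com/robots.txt', content)
start = time.perf_counter()
for _ in range(100000):
    parsed.allowed('/org/example.html', 'other-bot')
print('Evaluate: %d calls in %.3fs' % (100000, time.perf_counter() - start))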