Example #1
def test_get_robots_parser_not_from_root():
    responses.add(responses.HEAD, 'http://test.com/robots.txt', status=200)
    request.urlopen = lambda url: UrlOpenMock(url)

    rp = crawl.get_robots_parser_if_exists('http://test.com/test')
    assert rp is not None

    rp = crawl.get_robots_parser_if_exists('test.com/test/test1')
    assert rp is not None
Example #2
def api_page():
    """
    API endpoint for requesting page data. https://www.statsify.us/api

    :param url: the url to gather data from.
    :type url: str
    :return: if a successful request is made, the data will be returned in
        a JSON object under the 'data' key; otherwise, a JSON object
        with an 'error' key will be returned.
    :rtype: object
    """

    if 'url' not in request.args:
        response = jsonify({'error': ERROR_MESSAGES[2]})
        response.headers.add('Access-Control-Allow-Origin', '*')
        return response

    rp = crawl.get_robots_parser_if_exists(request.args['url'])

    page = Page(request.args['url'], rp)

    if page.html is None:
        response = jsonify({'error': page.error})
        response.headers.add('Access-Control-Allow-Origin', '*')
        return response

    response = jsonify({'data': page})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
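
A minimal client sketch for the endpoint above. The route path is not shown in the example, so the URL below is only an assumption; the 'url' query parameter and the 'data'/'error' response keys come from the docstring and code.

import requests

# Hypothetical endpoint URL; only the 'url' query parameter is documented above.
resp = requests.get('https://www.statsify.us/api/page',
                    params={'url': 'https://example.com'})
payload = resp.json()
if 'error' in payload:
    print('Request failed:', payload['error'])
else:
    print('Page data:', payload['data'])
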
Example #3
def test_get_robots_parser_if_exists():
    responses.add(responses.HEAD, 'http://test.com/robots.txt', status=200)
    request.urlopen = lambda url: UrlOpenMock(
        url, text='User-agent: *\nAllow: /test\nDisallow: /')

    rp = crawl.get_robots_parser_if_exists('http://test.com/')
    assert rp.can_fetch('*', 'http://test.com/test')
    assert not rp.can_fetch('*', 'http://test.com')
    assert not rp.can_fetch('*', 'http://test.com/other')
Example #4
def test_page_disallowed():
    responses.add(responses.GET,
                  'http://test.com/test',
                  content_type='text/html',
                  body='<p>hi</p>')
    responses.add(responses.HEAD, 'http://test.com/robots.txt', status=200)

    request.urlopen = lambda url: helpers.UrlOpenMock(
        url, text='User-agent: *\nAllow: /test/test\nDisallow: /test')
    rp = crawl.get_robots_parser_if_exists('http://test.com')

    page = Page('http://test.com/test', rp)
    assert page.rp is rp
    assert page.html is None
    assert hasattr(page, 'error')
    assert page.error == helpers.ERROR_MESSAGES[0]
Example #5
    def __init__(self, url, generate_depth=1):
        """
        Creates a Website object to represent the contents of multiple pages within
        a website. If the given url is valid and can be requested, self.pages,
        self.text, and other instance variables will contain various statistics about
        the website; otherwise, self.error describes what went wrong.

        :param url: the url of the website.
        :type url: str
        :param generate_depth: the maximum recursive depth to crawl through the website.
        :type generate_depth: int
        """

        rp = crawl.get_robots_parser_if_exists(url)
        self.root = PageNode(Page(url, rp), generate_depth=generate_depth)

        if self.root.page.html is None:
            self.error = self.root.page.error
        else:
            self.pages = {
                self.root.page.url: {
                    'page': self.root.page,
                    'freq': 0
                }
            }
            self.text = ''
            self.total_word_count = 0

            self.outbound_links = set(self.root.page.outbound_links)

            self.traverse_all_pages()

            divisor = len(self.pages)
            if '*' in self.pages:
                divisor -= 1
            self.average_word_count = self.total_word_count / divisor
            self.key_phrases = helpers.get_key_phrases_from_text(self.text,
                                                                 max_length=3)
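
A short usage sketch for the constructor above. The import path is an assumption; the attributes read below are the ones set in the code shown.

# Usage sketch; the module path 'statsify.website' is an assumption.
from statsify.website import Website

site = Website('http://example.com', generate_depth=2)
if hasattr(site, 'error'):
    print('Could not crawl site:', site.error)
else:
    print('Pages crawled:', len(site.pages))
    print('Average word count:', site.average_word_count)
    print('Key phrases:', site.key_phrases)
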
Example #6
def test_get_robots_parser_if_does_not_exist():
    responses.add(responses.HEAD, 'http://test.com/robots.txt', status=404)
    request.urlopen = lambda url: UrlOpenMock(url)

    rp = crawl.get_robots_parser_if_exists('http://test.com')
    assert rp is None
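
Taken together, the tests above imply the rough shape of get_robots_parser_if_exists: a HEAD request (made with requests, which the responses library stubs) checks whether robots.txt exists, and urllib's RobotFileParser (whose read() goes through urllib.request.urlopen, which the tests patch) parses it. The following is only a sketch consistent with that behavior, not the project's actual implementation.

import requests
from urllib import robotparser
from urllib.parse import urlparse

def get_robots_parser_if_exists(url):
    # Sketch only: inferred from the tests above, not copied from the project.
    if '//' not in url:
        url = 'http://' + url  # the tests also pass scheme-less URLs
    parsed = urlparse(url)
    robots_url = '{}://{}/robots.txt'.format(parsed.scheme, parsed.netloc)
    # The tests stub a HEAD request to robots.txt, so existence is checked first.
    if requests.head(robots_url).status_code != 200:
        return None
    rp = robotparser.RobotFileParser(robots_url)
    rp.read()  # read() fetches the file via urllib.request.urlopen
    return rp
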