Пример #1
0
def test_backend(backend):
    """Run the frontier tester against ``backend`` and print the crawl order.

    Builds a graph from the module-level ``SITE_LIST``, creates a frontier
    whose ``BACKEND`` setting is ``backend``, runs ``FrontierTester`` over
    both, and prints the URL of every page in ``tester.sequence``.

    :param backend: value assigned to ``settings.BACKEND`` (presumably a
        dotted path to a backend class -- confirm against callers).
    """
    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # print() with a single argument gives the same output on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
Пример #2
0
    def __init__(self):
        """Create default settings and a logger stub that prints every message.

        ``self.logger.backend`` receives ``info``, ``debug``, ``warning`` and
        ``error`` attributes, all bound to the same printing function.
        """
        def log(msg):
            # One pre-formatted argument keeps the output identical on
            # Python 2 and 3 (two spaces after the colon, exactly what the
            # former ``print "Test Manager: ", msg`` statement emitted).
            print("Test Manager:  %s" % (msg,))

        self.logger = TestManager.Nothing()
        self.settings = Settings()
        self.logger.backend = TestManager.Nothing()
        # The comma after 'info' matters: without it Python concatenates the
        # adjacent literals into 'infodebug', so neither ``info`` nor
        # ``debug`` would be set on the backend logger.
        for log_level in ('info', 'debug', 'warning', 'error'):
            setattr(self.logger.backend, log_level, log)
Пример #3
0
 def get_settings(self):
     """
     Build the Settings object for this backend.

     BACKEND is taken from ``self.backend_class``; STRATEGY is fixed to
     'tests.backends.BasicCrawlingStrategy'.
     """
     attributes = {
         'BACKEND': self.backend_class,
         'STRATEGY': 'tests.backends.BasicCrawlingStrategy',
     }
     return Settings(attributes=attributes)
Пример #4
0
def test_backend(backend):
    """Run the frontier tester against ``backend`` and print the crawl order.

    Builds a graph from the module-level ``SITE_LIST``, creates a frontier
    whose ``BACKEND`` setting is ``backend``, runs ``FrontierTester`` over
    both, and prints the URL of every page in ``tester.sequence``.

    :param backend: value assigned to ``settings.BACKEND`` (presumably a
        dotted path to a backend class -- confirm against callers).
    """
    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # print() with a single argument gives the same output on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
Пример #5
0
def test_overused():
    """Exercise set_overused / get_overused_for_batch with OVERUSED_BATCH_DELAY=2.

    The expected dictionaries are checked in a fixed order because every
    ``get_overused_for_batch`` call may change what the next one returns.
    """
    backend = Backend(Settings(attributes={'OVERUSED_BATCH_DELAY': 2}))
    backend.frontier_start()

    def current():
        # A fresh [0, 1] list on every call, as in the inline literals.
        return backend.get_overused_for_batch([0, 1])

    backend.set_overused(0, ['a'])
    assert current() == {0: {'a'}, 1: set()}

    backend.set_overused(0, ['a', 'b'])
    assert current() == {0: {'a', 'b'}, 1: set()}

    backend.set_overused(1, ['a'])
    # Three consecutive reads with no further set_overused calls in between.
    for expected in (
        {0: {'b'}, 1: {'a'}},
        {0: set(), 1: {'a'}},
        {0: set(), 1: set()},
    ):
        assert current() == expected
Пример #6
0
def test_logic(backend):
    """Crawl the graph stored in data/graph.db with ``backend`` and print the order.

    :param backend: value assigned to the frontier's ``BACKEND`` setting.
    """
    # Graph
    site_graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier: options are applied in the same order as plain assignments
    frontier_settings = Settings()
    for option, value in (
        ('BACKEND', backend),
        ('LOGGING_MANAGER_ENABLED', True),
        ('LOGGING_BACKEND_ENABLED', True),
        ('LOGGING_DEBUGGING_ENABLED', False),
        ('TEST_MODE', True),
    ):
        setattr(frontier_settings, option, value)
    manager = FrontierManager.from_settings(frontier_settings)

    # Tester
    runner = FrontierTester(manager, site_graph)
    runner.run(add_all_pages=True)

    # Show crawling sequence
    separator = '-' * 80
    print(separator)
    print(manager.backend.name)
    print(separator)
    for crawled in runner.sequence:
        print(crawled.url)
Пример #7
0
def test_logic(backend):
    """Crawl the graph stored in data/graph.db with ``backend`` and print the order.

    Creates a frontier in test mode whose ``BACKEND`` setting is ``backend``,
    runs ``FrontierTester`` with ``add_all_pages=True``, and prints the
    backend name followed by the URL of every page in ``tester.sequence``.

    :param backend: value assigned to ``settings.BACKEND``.
    """
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    settings.TEST_MODE = True
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Show crawling sequence.  Single-argument print() gives the same output
    # on Python 2 and 3; the print statement is a SyntaxError on Python 3.
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)
    for page in tester.sequence:
        print(page.url)
Пример #8
0

import re

import requests

from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings

from six.moves.urllib.parse import urljoin


# Frontier configuration for this example: in-memory FIFO backend with
# manager and backend logging switched on.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
# NOTE(review): presumably the total number of requests to crawl -- confirm
SETTINGS.MAX_REQUESTS = 100
# NOTE(review): presumably the batch size per "next requests" call -- confirm
SETTINGS.MAX_NEXT_REQUESTS = 10

# Start URLs (presumably handed to the frontier as seeds; the code that uses
# them is not visible here).
SEEDS = [
    'http://www.imdb.com',
]

# Captures the href value of <a ...> tags, case-insensitively.  The trailing
# ``[^>]*>`` skips any attributes that follow href.  The former ``.?>`` tail
# allowed at most one character there, so for a tag such as
# <a href="/x" class="y"> the lazy group ran on and captured '/x" class="y'.
LINK_RE = re.compile(r'<a.+?href="([^"]*)"[^>]*>', re.I)


def extract_page_links(response):
    """Return every LINK_RE match in ``response.text`` joined onto ``response.url``."""
    absolute_links = []
    for href in LINK_RE.findall(response.text):
        absolute_links.append(urljoin(response.url, href))
    return absolute_links

if __name__ == '__main__':
Пример #9
0
 def get_settings(self):
     """
     Build the Settings object for this backend.

     The only attribute set is BACKEND, taken from ``self.backend_class``.
     """
     attributes = {'BACKEND': self.backend_class}
     return Settings(attributes=attributes)
Пример #10
0
from grequests import AsyncRequest, get as grequests_get, map as grequests_map

from frontera.core.models import Request as FrontierRequest
from frontera.utils.converters import BaseRequestConverter
from frontera.contrib.requests.converters import ResponseConverter

from frontera.utils.managers import FrontierManagerWrapper
from frontera.core import get_slot_key
from frontera import Settings

from six import iteritems
from six.moves.urllib.parse import urljoin


# Frontier configuration for this example: MemoryDFSOverusedBackend with
# manager logging on and backend logging off.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = False
# NOTE(review): 0 presumably means "no request limit" -- confirm
SETTINGS.MAX_REQUESTS = 0
# NOTE(review): presumably the batch size per "next requests" call -- confirm
SETTINGS.MAX_NEXT_REQUESTS = 40

# Start URLs (presumably handed to the frontier as seeds; the code that uses
# them is not visible here).
SEEDS = [
    'http://www.imdb.com',
    'http://www.bbc.com/',
    'http://www.amazon.com/'
]

# Captures the href value of <a ...> tags, case-insensitively.  The trailing
# ``[^>]*>`` skips any attributes that follow href.  The former ``.?>`` tail
# allowed at most one character there, so for a tag such as
# <a href="/x" class="y"> the lazy group ran on and captured '/x" class="y'.
LINK_RE = re.compile(r'<a.+?href="([^"]*)"[^>]*>', re.I)

Пример #11
0
"""
Frontier tester using recording data
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

# Frontier configuration for this example: memory_heapq FIFO backend with
# manager and backend logging on, debugging logging off.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False


if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show frontier pages.  Every print() call gets exactly one argument so
    # the output is the same on Python 2 and 3; the print statement used
    # before is a SyntaxError on Python 3.
    print('-' * 80)
    print(' Frontier pages')
    print('-' * 80)
    for page in frontier.backend.pages.values():
        # Space-separated, matching what ``print a, b, c`` produced.
        print('%s %s %s' % (page.url, page.depth, page.state))
Пример #12
0
import re

import requests

from urlparse import urljoin

from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings

# Frontier configuration for this example: in-memory FIFO backend with
# manager and backend logging switched on.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
# NOTE(review): presumably the total number of requests to crawl -- confirm
SETTINGS.MAX_REQUESTS = 100
# NOTE(review): presumably the batch size per "next requests" call -- confirm
SETTINGS.MAX_NEXT_REQUESTS = 10

# Start URLs; the main block below wraps each one in a requests.Request and
# passes them to frontier.add_seeds().
SEEDS = [
    'http://www.imdb.com',
]

# Captures the quoted value of every href="..." attribute (any tag, lowercase
# "href" only since no flags are passed).
LINK_RE = re.compile(r'href="(.*?)"')


def extract_page_links(response):
    """Return every LINK_RE match in ``response.text`` joined onto ``response.url``."""
    absolute_links = []
    for href in LINK_RE.findall(response.text):
        absolute_links.append(urljoin(response.url, href))
    return absolute_links

if __name__ == '__main__':

    frontier = RequestsFrontierManager(SETTINGS)
    frontier.add_seeds([requests.Request(url=url) for url in SEEDS])
    while True:
Пример #13
0
"""
Frontier tester using recording data
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

# Frontier configuration for this example: memory_heapq FIFO backend with
# manager and backend logging on, debugging logging off.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Load the recorded crawl graph and build a frontier from SETTINGS
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')
    frontier = FrontierManager.from_settings(SETTINGS)

    # Drive the frontier over the recorded graph
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Report: a three-line banner, then url / depth / state of every page
    # the backend holds
    print('\n'.join(('-' * 80, ' Frontier pages', '-' * 80)))
    for page in list(frontier.backend.pages.values()):
        print(page.url, page.depth, page.state)
Пример #14
0
"""
Frontier initialization from settings
"""
from frontera import FrontierManager, Settings, graphs, Request, Response

# Frontier configuration for this example: in-memory FIFO backend, all three
# logging switches on, and TEST_MODE enabled.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = True
SETTINGS.TEST_MODE = True

# Walk-through: build a frontier from SETTINGS, seed it from the graph stored
# in data/graph.db and look up each requested page in that graph.
if __name__ == '__main__':
    # Create graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Create frontier from settings
    frontier = FrontierManager.from_settings(SETTINGS)

    # Add seeds: one Request per seed page of the graph
    frontier.add_seeds([Request(seed.url) for seed in graph.seeds])

    # Get next requests
    next_requests = frontier.get_next_requests()

    # Crawl pages
    for request in next_requests:

        # Fake page crawling: the page is read from the graph, not fetched.
        # NOTE(review): crawled_page is never used in the visible code -- the
        # example looks truncated here; confirm against the full file.
        crawled_page = graph.get_page(request.url)
Пример #15
0
"""
Frontier tester usage example
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier (no BACKEND is set here, so whatever Settings() provides by
    # default is used)
    settings = Settings()
    settings.TEST_MODE = True
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show crawling sequence.  Single-argument print() gives the same output
    # on Python 2 and 3; the print statement is a SyntaxError on Python 3.
    for page in tester.sequence:
        print(page.url)
Пример #16
0
"""
Frontier tester usage example
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

# Script entry point: run FrontierTester over the graph in data/graph.db and
# print the URL of each page in the resulting crawl sequence.
if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier (no BACKEND is set here, so whatever Settings() provides by
    # default is used)
    settings = Settings()
    settings.TEST_MODE = True
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)