def test_backend(backend):
    """Run a frontier test crawl with ``backend`` and print the visited URLs.

    Builds a graph from ``SITE_LIST``, creates a frontier from settings with
    manager and backend logging enabled (debug logging disabled), prints the
    backend name between two rules, runs a ``FrontierTester`` over the graph
    and finally prints the URL of every page in the tester's sequence.

    :param backend: value assigned to ``settings.BACKEND`` (presumably the
        dotted path of a backend class -- confirm against callers).
    """
    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Single-argument print() calls produce the same output on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
def __init__(self):
    """Create default settings and a stub logger that prints every message.

    ``self.logger`` and ``self.logger.backend`` are bare
    ``TestManager.Nothing`` instances. The backend stub receives ``info``,
    ``debug``, ``warning`` and ``error`` attributes, all bound to the same
    printing function. ``self.settings`` is a default ``Settings()``.
    """
    def log(msg):
        # One pre-formatted argument keeps the output identical on Python 2
        # and Python 3: the prefix, two spaces, then the message.
        print("Test Manager:  %s" % (msg,))

    self.logger = TestManager.Nothing()
    self.settings = Settings()
    self.logger.backend = TestManager.Nothing()
    # Every level must be its own tuple element: adjacent string literals
    # without a comma are concatenated ('info' 'debug' -> 'infodebug'),
    # which would leave both `info` and `debug` undefined on the stub.
    for log_level in ('info', 'debug', 'warning', 'error'):
        setattr(self.logger.backend, log_level, log)
def get_settings(self):
    """
    Build the settings object for the backend under test
    """
    # BACKEND comes from the instance; the strategy path is fixed.
    overrides = {
        'BACKEND': self.backend_class,
        'STRATEGY': 'tests.backends.BasicCrawlingStrategy',
    }
    return Settings(attributes=overrides)
def test_backend(backend):
    """Crawl the ``SITE_LIST`` graph through ``backend`` and print the result.

    A frontier is created from settings that enable manager and backend
    logging and disable debug logging. The backend name is printed between
    two 80-character rules, a ``FrontierTester`` is run over the graph, and
    the URL of each page in the tester's sequence is printed.

    :param backend: value assigned to ``settings.BACKEND`` (presumably the
        dotted path of a backend class -- confirm against callers).
    """
    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # print() with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement, which does not parse on Python 3.
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
def test_overused():
    """Check ``get_overused_for_batch`` results after ``set_overused`` calls.

    The backend is built with ``OVERUSED_BATCH_DELAY`` set to 2 and started
    before any keys are marked; each assertion queries ids 0 and 1.
    """
    backend = Backend(Settings(attributes={'OVERUSED_BATCH_DELAY': 2}))
    backend.frontier_start()

    def next_batch():
        # A fresh [0, 1] list is passed on every query.
        return backend.get_overused_for_batch([0, 1])

    backend.set_overused(0, ['a'])
    assert next_batch() == {0: {'a'}, 1: set()}

    backend.set_overused(0, ['a', 'b'])
    assert next_batch() == {0: {'a', 'b'}, 1: set()}

    backend.set_overused(1, ['a'])
    assert next_batch() == {0: {'b'}, 1: {'a'}}

    # No further set_overused calls: the next two queries are expected to
    # return progressively fewer keys.
    assert next_batch() == {0: set(), 1: {'a'}}
    assert next_batch() == {0: set(), 1: set()}
def test_logic(backend):
    """Run the tester over the sqlite graph with ``backend`` and print the crawl order.

    :param backend: value stored in ``settings.BACKEND``.
    """
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier
    settings = Settings()
    options = (
        ('BACKEND', backend),
        ('LOGGING_MANAGER_ENABLED', True),
        ('LOGGING_BACKEND_ENABLED', True),
        ('LOGGING_DEBUGGING_ENABLED', False),
        ('TEST_MODE', True),
    )
    for option, value in options:
        setattr(settings, option, value)
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Show crawling sequence
    rule = '-' * 80
    print(rule)
    print(frontier.backend.name)
    print(rule)
    for page in tester.sequence:
        print(page.url)
def test_logic(backend):
    """Run the frontier tester over the sqlite graph and print the crawl order.

    A frontier is created from settings with ``TEST_MODE`` enabled, manager
    and backend logging enabled and debug logging disabled. The tester is run
    with ``add_all_pages=True``; afterwards the backend name is printed
    between two rules, followed by the URL of each page in the tester's
    sequence.

    :param backend: value assigned to ``settings.BACKEND`` (presumably the
        dotted path of a backend class -- confirm against callers).
    """
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    settings.TEST_MODE = True
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Show crawling sequence
    # Single-argument print() calls give the same output on Python 2 and 3;
    # the print statement would be a SyntaxError on Python 3.
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)
    for page in tester.sequence:
        print(page.url)
import re

import requests

from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings
from six.moves.urllib.parse import urljoin

# Frontier configuration used by this script.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.MAX_REQUESTS = 100
SETTINGS.MAX_NEXT_REQUESTS = 10

# URLs the crawl is seeded with.
SEEDS = [
    'http://www.imdb.com',
]

# Captures the quoted href value of an "<a ...>" tag; matching ignores case.
LINK_RE = re.compile(r'<a.+?href="(.*?)".?>', re.I)


def extract_page_links(response):
    """Return every href matched by LINK_RE in ``response.text``, joined against ``response.url``."""
    return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]


if __name__ == '__main__':
def get_settings(self):
    """
    Build the settings object for the backend under test
    """
    # Only the backend class is overridden.
    attributes = {'BACKEND': self.backend_class}
    return Settings(attributes=attributes)
from grequests import AsyncRequest, get as grequests_get, map as grequests_map

from frontera.core.models import Request as FrontierRequest
from frontera.utils.converters import BaseRequestConverter
from frontera.contrib.requests.converters import ResponseConverter
from frontera.utils.managers import FrontierManagerWrapper
from frontera.core import get_slot_key
from frontera import Settings
from six import iteritems
from six.moves.urllib.parse import urljoin

# Frontier configuration used by this script (backend logging is off here).
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = False
SETTINGS.MAX_REQUESTS = 0
SETTINGS.MAX_NEXT_REQUESTS = 40

# URLs the crawl is seeded with.
SEEDS = [
    'http://www.imdb.com',
    'http://www.bbc.com/',
    'http://www.amazon.com/'
]

# Captures the quoted href value of an "<a ...>" tag; matching ignores case.
# NOTE(review): `re` is used here but no `import re` is visible in this
# excerpt -- confirm it is imported elsewhere in the file.
LINK_RE = re.compile(r'<a.+?href="(.*?)".?>', re.I)
"""
Frontier tester using recording data
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

# Frontier configuration: heapq-based FIFO backend, manager and backend
# logging enabled, debug logging disabled.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show frontier pages
    # Each print() receives exactly one argument so the output is the same
    # on Python 2 and Python 3 (the print statement does not parse on 3).
    print('-' * 80)
    print(' Frontier pages')
    print('-' * 80)
    for page in frontier.backend.pages.values():
        # Space-separated fields, formatted up front rather than passed as
        # several print arguments (which Python 2 would show as a tuple).
        print('%s %s %s' % (page.url, page.depth, page.state))
import re

import requests

# NOTE(review): `urlparse` is the Python 2 module name; on Python 3 `urljoin`
# lives in `urllib.parse` -- confirm which interpreter this script targets.
from urlparse import urljoin

from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings

# Frontier configuration used by this script.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.MAX_REQUESTS = 100
SETTINGS.MAX_NEXT_REQUESTS = 10

# URLs the crawl is seeded with.
SEEDS = [
    'http://www.imdb.com',
]

# Captures the quoted value of any href="..." attribute (not limited to <a> tags).
LINK_RE = re.compile(r'href="(.*?)"')


def extract_page_links(response):
    """Return every href matched by LINK_RE in ``response.text``, joined against ``response.url``."""
    return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]


if __name__ == '__main__':
    # Build the frontier from SETTINGS and seed it with one request per URL.
    frontier = RequestsFrontierManager(SETTINGS)
    frontier.add_seeds([requests.Request(url=url) for url in SEEDS])
    while True:
"""
Replay recorded crawl data through a frontier and list the resulting pages
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

# Frontier configuration for the replay run.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Recorded graph and the frontier that will replay it
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')
    frontier = FrontierManager.from_settings(SETTINGS)

    # Drive the frontier with the tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Report header
    print('-' * 80)
    print(' Frontier pages')
    print('-' * 80)

    # One line per page held by the backend (snapshot taken before printing)
    stored_pages = list(frontier.backend.pages.values())
    for page in stored_pages:
        print(page.url, page.depth, page.state)
"""
Frontier initialization from settings
"""
from frontera import FrontierManager, Settings, graphs, Request, Response

# Frontier configuration: in-memory FIFO backend with manager, backend and
# debug logging all enabled, and TEST_MODE switched on.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = True
SETTINGS.TEST_MODE = True

if __name__ == '__main__':
    # Create graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Create frontier from settings
    frontier = FrontierManager.from_settings(SETTINGS)

    # Add seeds: one Request per seed in the graph, built from the seed's URL
    frontier.add_seeds([Request(seed.url) for seed in graph.seeds])

    # Get next requests
    next_requests = frontier.get_next_requests()

    # Crawl pages
    for request in next_requests:

        # Fake page crawling: look the page up in the graph by the request URL
        crawled_page = graph.get_page(request.url)
"""
Frontier tester usage example
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier: TEST_MODE on, manager and backend logging on, debug logging off
    settings = Settings()
    settings.TEST_MODE = True
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        # Single-argument print() behaves the same on Python 2 and 3; the
        # print statement is a SyntaxError on Python 3.
        print(page.url)
"""
Example: drive a frontier with FrontierTester and print the crawl order
"""
from frontera import FrontierManager, FrontierTester, Settings, graphs

if __name__ == '__main__':
    # Graph backing the simulated crawl
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier settings: test mode, manager/backend logging on, debug off
    settings = Settings()
    for option, value in (
        ('TEST_MODE', True),
        ('LOGGING_MANAGER_ENABLED', True),
        ('LOGGING_BACKEND_ENABLED', True),
        ('LOGGING_DEBUGGING_ENABLED', False),
    ):
        setattr(settings, option, value)
    frontier = FrontierManager.from_settings(settings)

    # Build the tester and run it
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Print one URL per crawled page, in sequence order
    for url in (page.url for page in tester.sequence):
        print(url)