def bts_book_manager(_BooksWorker):
    """
    A BooksToScrape Manager test fixture for a live network call.

    Here, we spin up two workers while we have three tasks. It is important
    to test with fewer workers than total tasks. There are plenty of ways to
    break this test when refactoring. One likely source is the BaseWorker
    class method `load_items`. It took me half a day to track down a bug in
    that method which resulted in this test only passing when the number of
    workers equaled the number of tasks. That was the previous default way
    to run this test, so the bug went undetected.
    """
    # first, set up newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()

    # ensure this file is opened in binary mode
    book_data_file = open('c:/temp/book_data.csv', 'a+b')

    exporters = [
        CsvItemExporter(
            fields_to_export=['book_title', 'stock', 'price'],
            file=book_data_file,
            encoding='utf_8_sig')
    ]

    file = get_file_path('book_titles.xlsx')
    trackers = ['books.toscrape.com']
    tasks = StatefulBook(file, trackers, keywords='titles', autorun=True)

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            workers=2,  # this creates 2 scrapers and assigns each a book as a task
            kwargs={'timeout': (3.0, 20.0)})
    ]

    manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups,
                                    pool=5)

    yield manager

    # teardown
    delete_job('books_scrape')
    del ndb.root._spiders
    ndb.commit()
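
# Hypothetical usage sketch (not from the original test suite): a test that
# consumes this fixture only needs to start the manager and inspect the
# results. `main()` is assumed to be the manager's blocking entry point, and
# `get_job_results` is a hypothetical helper that reads scraped spiders back
# out of newt.db by job name; the assertion count mirrors the three book-title
# tasks in book_titles.xlsx.
def test_books_to_scrape(bts_book_manager):
    bts_book_manager.main()  # assumed: blocks until all workers finish
    results = get_job_results('books_scrape')  # hypothetical helper
    assert len(results) == 3  # one scraped result per book-title task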
def bts_broker_manager(_BooksToScrapeGroup, _BooksWorker, broker_tasks,
                       broker_conn):
    """
    A BooksToScrape Manager test fixture for a live network call.

    Here, we use a broker (RabbitMQ) to test.
    """
    # set up newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()

    # ensure this file is opened in binary mode
    book_data_file = open('c:/temp/broker_data.csv', 'a+b')

    exporters = [
        CsvItemExporter(
            fields_to_export=['book_title', 'stock', 'price'],
            file=book_data_file,
            encoding='utf_8_sig')
    ]

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            workers=2,  # this creates 2 scrapers and assigns each a book as a task
            kwargs={'timeout': (3.0, 20.0)})
    ]

    manager = BooksWorkGroupManager('books_broker_scrape', broker_tasks,
                                    workgroups=groups, pool=5,
                                    connection=broker_conn)

    yield manager

    # teardown newt.db
    delete_job('books_broker_scrape')
    del ndb.root._spiders
    ndb.commit()
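
# Hypothetical sketch of one of the broker fixtures referenced above (the real
# `broker_conn` and `broker_tasks` fixtures are defined elsewhere). This
# assumes pytest and kombu are available and that kombu provides the RabbitMQ
# connection; the AMQP URL is a placeholder for a locally running broker.
@pytest.fixture(scope='module')
def broker_conn():
    from kombu import Connection
    conn = Connection('amqp://guest:guest@localhost:5672//')
    yield conn
    conn.release()  # close the connection during teardown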
                      file=open('c:/temp/book_data.json', 'a+b'),
                      encoding='utf_8_sig')
]

# 4) Set up the WorkGroups. You can create an arbitrary number of WorkGroups
# in a list. For example, say there are three different domains on which you
# want to search for the book titles from the excel file, and you want to
# scrape the price and stock data for each book title on each of the three
# websites. You could set up three different WorkGroups here (see the sketch
# after step 5 below). Last, the WorkGroup.name should match the tracker name.

groups = [
    WorkGroup(
        name='books.toscrape.com',
        url='http://books.toscrape.com/',
        spider=BooksToScrapeScraper,
        worker=BooksWorker,
        items=BookItems,
        loader=BookItemsLoader,
        exporters=exporters,
        workers=3,  # this creates 3 scrapers and assigns each a book as a task
        kwargs={'timeout': (3.0, 20.0)})
]

# 5) Last, set up the Manager. You can constrain the number of workers actually
# deployed through the `pool` parameter. For example, this is useful when using
# a Crawlera 'C10' plan, which limits concurrency to 10. To deploy all the
# workers concurrently, set the pool one higher than the total number of
# workers assigned in the groups in step #4 above. The +1 is for the pool
# manager.

manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups, pool=5)
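
# Hypothetical multi-domain sketch (not part of the original example): step 4
# mentions searching three different domains for the same book titles. The
# block below shows what that could look like. The two extra domain names are
# placeholders, and in practice each domain would need its own spider class;
# BooksToScrapeScraper is reused here only to keep the sketch self-contained.
# Note that the trackers list passed to StatefulBook would also need one entry
# matching each WorkGroup.name.
multi_domain_groups = [
    WorkGroup(name='books.toscrape.com', url='http://books.toscrape.com/',
              spider=BooksToScrapeScraper, worker=BooksWorker, items=BookItems,
              loader=BookItemsLoader, exporters=exporters, workers=3,
              kwargs={'timeout': (3.0, 20.0)}),
    WorkGroup(name='bookstore-two.example.com',
              url='http://bookstore-two.example.com/',
              spider=BooksToScrapeScraper,  # placeholder; a real site needs its own spider
              worker=BooksWorker, items=BookItems, loader=BookItemsLoader,
              exporters=exporters, workers=3,
              kwargs={'timeout': (3.0, 20.0)}),
    WorkGroup(name='bookstore-three.example.com',
              url='http://bookstore-three.example.com/',
              spider=BooksToScrapeScraper,  # placeholder; a real site needs its own spider
              worker=BooksWorker, items=BookItems, loader=BookItemsLoader,
              exporters=exporters, workers=3,
              kwargs={'timeout': (3.0, 20.0)}),
]

# 3 WorkGroups x 3 workers = 9 workers total; pool=10 (9 + 1 for the pool
# manager) deploys all of them concurrently, which also fits the Crawlera
# 'C10' concurrency limit mentioned in step 5.
multi_domain_manager = BooksWorkGroupManager('books_scrape', tasks,
                                             workgroups=multi_domain_groups,
                                             pool=10)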