def delete_job(job_id):
    """
    An ndb helper method that manipulates the _spiders object.
    """
    try:
        del ndb.root._spiders.lists[job_id]
        ndb.commit()
    except KeyError:
        pass
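# For orientation, a minimal sketch (hypothetical, not the real SpiderLists
# implementation) of the container semantics that delete_job, and
# pre_process_exports further below, rely on: a dict-backed `lists` attribute
# keyed by job name, with add() refusing duplicate names. Kept commented out
# so it does not shadow the imported SpiderLists.
#
# class SpiderLists:
#     def __init__(self):
#         self.lists = {}
#
#     def add(self, job_name, spider_list):
#         if job_name in self.lists:
#             raise KeyError(f'{job_name} already exists')
#         self.lists[job_name] = spider_list
#
#     def __getitem__(self, job_name):
#         return self.lists[job_name]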
@pytest.fixture()
def bts_book_manager(_BooksWorker):
    """
    A BooksToScrape Manager test fixture for a live network call. Here, we
    spin up two workers while we have three tasks. It is important to test
    with fewer workers than total tasks. There are plenty of ways to break
    this test when refactoring. One likely source is the BaseWorker class
    method `load_items`. It took me half a day to track down a bug in that
    method which caused this test to pass only when the number of workers
    equaled the number of tasks. That was the previous default way to run
    this test, so the bug went undetected.
    """
    # first, set up newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()
    # ensure this file is opened in binary mode
    book_data_file = open('c:/temp/book_data.csv', 'a+b')
    exporters = [
        CsvItemExporter(
            fields_to_export=['book_title', 'stock', 'price'],
            file=book_data_file,
            encoding='utf_8_sig')
    ]
    file = get_file_path('book_titles.xlsx')
    trackers = ['books.toscrape.com']
    tasks = StatefulBook(file, trackers, keywords='titles', autorun=True)
    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            # this creates 2 scrapers and assigns each a book as a task
            workers=2,
            kwargs={'timeout': (3.0, 20.0)})
    ]
    manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups,
                                    pool=5)

    yield manager

    # teardown
    delete_job('books_scrape')
    del ndb.root._spiders
    ndb.commit()
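# A usage sketch for the fixture above, assuming (hedged) that the manager
# exposes a main() entry point that blocks until all workgroups finish their
# tasks; the test name is hypothetical.
def test_bts_live_scrape(bts_book_manager):
    bts_book_manager.main()
    # after a successful run, the scrape list is retrievable by job name
    assert ndb.root._spiders['books_scrape'] is not None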
@pytest.fixture()
def bts_broker_manager(_BooksToScrapeGroup, _BooksWorker, broker_tasks,
                       broker_conn):
    """
    A BooksToScrape Manager test fixture for a live network call. Here, we
    use a broker (RabbitMQ) to deliver the tasks.
    """
    # set up newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()
    # ensure this file is opened in binary mode
    book_data_file = open('c:/temp/broker_data.csv', 'a+b')
    exporters = [
        CsvItemExporter(
            fields_to_export=['book_title', 'stock', 'price'],
            file=book_data_file,
            encoding='utf_8_sig')
    ]
    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            # this creates 2 scrapers and assigns each a book as a task
            workers=2,
            kwargs={'timeout': (3.0, 20.0)})
    ]
    manager = BooksWorkGroupManager('books_broker_scrape', broker_tasks,
                                    workgroups=groups, pool=5,
                                    connection=broker_conn)

    yield manager

    # teardown newt.db
    delete_job('books_broker_scrape')
    del ndb.root._spiders
    ndb.commit()
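# The broker_tasks and broker_conn fixtures are defined elsewhere. As a
# hedged sketch, a broker connection could be built with kombu against a
# local RabbitMQ broker; the fixture name, URL, and credentials below are
# assumptions, not the project's actual configuration.
@pytest.fixture()
def _broker_conn_sketch():
    from kombu import Connection
    # assumes a RabbitMQ broker running locally with default guest credentials
    connection = Connection('amqp://guest:guest@localhost:5672//')
    yield connection
    connection.release()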
def pre_process_exports(self, spider, task):
    if self.job_id != 'NONE':
        try:
            # create the list with the job name if it doesn't already exist
            ndb.root._spiders.add(self.job_id, SpiderList())
            print(
                f'Worker {self.name}-{self.number} created a new scrape_list '
                f'for {self.job_id}')
        except KeyError:
            # raised if a list with the same job_name already exists
            pass
        # export the scraper data to the items object
        items = self.load_items(spider)
        # save the items object to newt.db
        ndb.root._spiders[self.job_id].add(items)
        ndb.commit()
        print(
            f'Worker {self.name}-{self.number} saved {items!r} to '
            f'scrape_list "{self.job_id}" for task {task}.')
    else:
        # if job_id is NONE, skip saving the objects
        print(
            f'Worker {self.name}-{self.number} said job_name is {self.job_id} '
            f'so will not save it.')
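# Why the guard above uses `!=` rather than `is not`: `is` compares object
# identity, not value, so a 'NONE' string constructed at runtime would slip
# past an identity check (CPython 3.8+ also emits a SyntaxWarning for `is`
# against a literal). A quick illustration:
#
#     a = 'NONE'
#     b = ''.join(['NO', 'NE'])  # equal value, distinct object
#     a == b                     # True:  what the guard must catch
#     a is b                     # False: identity would miss this case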