def delete_job(ndb, job_id: str = None):
    """
    CAUTION: there is no going back from a delete. If you have a list
    `ndb.root.scrape_lists.lists['job_id_1']` holding a six-hour run of 50
    scrape jobs, this will delete it entirely. So you may want to save the
    scrapes you want to keep by moving them to another list first.

    The term `job_name` is just an alias for a newt scrapes.list name. When a
    WorkerManager is initiated with a `job_name`, it will either create a new
    scrapes.list with that name, or else add all the jobs to the existing list
    if one already matches the `job_name`.

    It turns out that deleting in newt is not straightforward. You can remove
    the class, but the data remains in PostgreSQL until the database is
    `packed`. Packing the database should be done periodically in production,
    for example from a celery cron task or a similar mechanism. To pack the
    database, all you need to do is run ndb.db().pack(days=<int>). I usually
    use three days: ndb.db().pack(days=3).

    :param ndb: an instance of ndb. Please refer to
        examples/books_to_scrape/persistence/newt_db.py for an example.
    :param job_id: the name of the job list to delete.
    :return: a logged confirmation message
    """
    try:
        del ndb.root.spiders.lists[job_id]
        ndb.commit()
        return logger.info(f'Deleted {job_id}')
    except KeyError:
        logger.info(f'Job-ID {job_id} does not exist.')
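# A minimal sketch of packing newt.db on a schedule with Celery beat, as the
# docstring above suggests. The Celery app below, its broker URL, and the task
# name are illustrative assumptions, not part of this codebase; the `ndb`
# import path and pack(days=3) call come from the docstring above.
from celery import Celery
from celery.schedules import crontab

app = Celery('maintenance', broker='amqp://localhost//')


@app.task(name='pack_newt_db')
def pack_newt_db():
    # import inside the task so each run gets a fresh database connection
    from examples.books_to_scrape.persistence.newt_db import ndb
    ndb.db().pack(days=3)  # drop object revisions older than three days


app.conf.beat_schedule = {
    'pack-newt-db-nightly': {
        'task': 'pack_newt_db',
        'schedule': crontab(hour=3, minute=0),  # run nightly at 03:00
    },
}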
def monitor(self, target):
    """
    This method actually spawns the scrapers and then generally just serves as
    a hook point. It allows some additional final actions to be performed on
    the scraper object after the worker completes the scrape job, but before
    it shuts down and the original object instance is lost. Here, we have
    access to the scrape results and could do some final transformations or
    data checking: for example, inspect the result and then update a results
    list, or even implement some other persistence method if you don't want to
    use PostgreSQL with newt.db.

    :param target: a <Worker()> class object. You must call
        target.spawn_spider() to start the Worker.
    """
    logger.info(f'spawning {target}')
    target.spawn_spider()  # this call is required

    # Calling spawn_spider() above instructs the Worker object to start the
    # scrape. So there will be some wait period at this point for each worker
    # to actually run out of work and quit with a graceful shutdown.
    # Therefore, A GOOD SPOT TO HOOK SOME POST-SCRAPE LOGIC ON YOUR WORKER'S
    # RESULTS IS RIGHT HERE. For example, I've simply set `events = []` as a
    # class attribute on the BaseWorker <Worker> object and then appended
    # `self` to `events` after each scrape returns, as completed by the Worker.

    for event in target.events:
        # here, event represents returned scraper objects which the worker has
        # completed. We can iterate through the event objects and, for example,
        # apply some data transformation, delete failed scrapes, or save data.
        logger.info(f'THIS IS A MONITOR EVENT - > {event}')

    # This last line is required; ensure the gevent.sleep(0) below remains.
    gevent.sleep(0)
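# A minimal sketch of hooking post-scrape logic into monitor(): instead of
# (or alongside) newt.db, dump each completed scraper's fields to a CSV file.
# `BooksWorkGroupManager` is the manager used later in this section; treating
# it as importable here, and the `book_title` attribute, are assumptions,
# while `price` and `stock` appear on the scraper in post_process_exports().
import csv

import gevent


class CsvMonitorManager(BooksWorkGroupManager):

    def monitor(self, target):
        target.spawn_spider()  # required: starts the Worker's scrape loop
        with open(f'{target.name}-results.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            for event in target.events:
                # each event is a scraper object the worker completed
                writer.writerow([getattr(event, 'book_title', ''),
                                 event.price, event.stock])
        gevent.sleep(0)  # required: yield control back to the gevent hub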
def manage(self):
    """
    Manage will hand out work when the appropriate Worker is free. The manager
    timeout must be less than the worker timeout, or else the workers will be
    idled and shut down.
    """
    try:
        while True:
            for name, workgroup in self.workgroups.items():
                for qname, q in self.qitems.items():
                    if name == qname:  # workgroup name must match tracker name
                        # a tracker with the same name as the workgroup is,
                        # effectively, the workgroup's task queue, so now
                        # assign a task to a worker from that task queue
                        for worker in workgroup:
                            one_task = q.get(timeout=self.mgr_qtimeout)
                            worker.tasks.put(one_task)
            gevent.sleep(0)
    except Empty:
        self.mgr_no_work = True
        if self.mgr_should_stop:
            logger.info(f"Assigned all {name} work. I've been told I should stop.")
            self.should_stop = True
        else:
            logger.info(f"Assigned all {name} work. Awaiting more tasks to assign.")
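# A toy illustration, separate from the project classes above, of why the
# manager's queue timeout must be shorter than the workers' timeout: the
# manager gives up waiting for inbound work quickly, while each worker waits
# longer on its own queue, so it is not idled and shut down prematurely. All
# names below are illustrative stand-ins, not project code.
import gevent
from gevent.queue import Queue, Empty

inbound = Queue()        # stands in for a tracker's task queue
worker_tasks = Queue()   # stands in for an individual worker's queue


def toy_manage(mgr_qtimeout=1):
    try:
        while True:
            worker_tasks.put(inbound.get(timeout=mgr_qtimeout))
    except Empty:
        print('manager: no inbound work for 1s, stop assigning')


def toy_spawn_spider(qtimeout=5):
    try:
        while True:
            task = worker_tasks.get(timeout=qtimeout)
            print(f'worker got {task}')
            gevent.sleep(0.5)  # pretend to scrape
    except Empty:
        print('worker: idle for 5s, graceful shutdown')


inbound.put('Soumission')
inbound.put('Black Dust')
gevent.joinall([gevent.spawn(toy_manage), gevent.spawn(toy_spawn_spider)])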
def process_task(self, body, message):
    """
    Process messages to extract the task keywords and then load them into a
    gevent Queue for each tracker. To customize how this Manager class works
    with the broker, this method should be a top consideration to override.
    `kwargs` is not currently used, but it could be very useful for setting
    logic flags to act on in this method.
    """
    keywords = body['keywords']
    kwargs = body['kwargs']
    logger.info(f'Got task: {reprcall(keywords)}')
    try:
        if isinstance(keywords, str):
            keywords = json.loads(keywords)
        for key in self.qitems.keys():
            for item in keywords:
                self.qitems[key].put(item)
        if not self.mgr_should_stop:
            if self.mgr_no_work:
                gevent.spawn(self.manage).join()
    except Exception as exc:
        logger.error(f'task raised exception: {exc}')
    message.ack()
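# A minimal sketch of overriding process_task() to act on a kwargs flag, as
# the docstring above suggests. `BooksWorkGroupManager` is the manager used
# later in this section and is assumed importable; the `deduplicate` flag and
# the `seen` set are illustrative, not part of this codebase.
import json

import gevent


class DedupTaskManager(BooksWorkGroupManager):

    seen = set()  # keywords already queued across broker messages

    def process_task(self, body, message):
        keywords = body['keywords']
        kwargs = body.get('kwargs', {})
        if isinstance(keywords, str):
            keywords = json.loads(keywords)
        if kwargs.get('deduplicate'):
            # use kwargs as a logic flag: skip keywords we already queued
            keywords = [k for k in keywords if k not in self.seen]
        self.seen.update(keywords)
        for key in self.qitems.keys():
            for item in keywords:
                self.qitems[key].put(item)
        if not self.mgr_should_stop and self.mgr_no_work:
            gevent.spawn(self.manage).join()
        message.ack()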
def get_job_results(ndb, job_id: str = None):
    """
    Return a list of results for a given job_id.

    :param ndb: an instance of ndb. Please refer to
        examples/books_to_scrape/persistence/newt_db.py for an example.
    :param job_id: the `job_id` assigned to the WorkerManager during the
        scrape. This is then used as the name of the list in newt.db to which
        resulting objects are saved.
    :return: [<SplashScraperData(('books.toscrape.com', 'soulsearcher'))>, ...]
    """
    try:
        return ndb.root.spiders.lists[job_id].results
    except KeyError:
        logger.info(f'Job-ID {job_id} does not exist.')
def spawn_spider(self, **kwargs):
    """
    Start the Worker: pull tasks from its queue and execute them until the
    queue times out empty.
    """
    try:
        while True:
            task = self.tasks.get(timeout=self.qtimeout)  # decrements queue by 1
            logger.info(f'Worker {self.name}-{self.number} got task {task}')
            spider = self.get_spider(task, **kwargs)
            spider.start_http_session(**self.http_session)
            # OK, right here is where we wait for the spider to return a result.
            self.result(spider, task)
    except Empty:
        logger.info(f'Quitting time for worker {self.name}-{self.number}!')
def pre_process_exports(self, spider, task):
    """
    A hook point for customization before the process_exports method is
    called. In this example, we use this method to save our spider data to
    PostgreSQL using newt.db.

    :param spider: the Scraper or Crawler object (i.e. MouseKeyScraper())
    :param task: just passing through the task item for printing.
    """
    if self.job_id != 'NONE':
        try:
            # create the list with the job name if it doesn't already exist
            ndb.root.spiders.add(self.job_id, SpiderList())
            logger.info(
                f'Worker {self.name}-{self.number} created a new spider '
                f'list for {self.job_id}')
        except KeyError:
            # raised if there is already a list with the same job_name
            pass
        # export the scraper data to the items object
        items = self.load_items(spider)
        # save the items object to newt.db
        ndb.root.spiders[self.job_id].add(items)
        ndb.commit()
        logger.info(
            f'Worker {self.name}-{self.number} saved {items!r} to '
            f'scrape_list "{self.job_id}" for task {task}.')
    else:
        # if job_id is NONE, skip saving the objects
        logger.info(
            f'Worker {self.name}-{self.number} said job_name is {self.job_id} '
            f'so will not save it.')
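# A minimal sketch of an alternative pre_process_exports() hook that skips
# newt.db and appends each item export to a JSON-lines file. `BooksWorker` is
# an illustrative subclass name for the project's Worker, and serializing the
# items object with vars() is an assumption; adapt it to your item container.
import json


class JsonLinesWorker(BooksWorker):

    def pre_process_exports(self, spider, task):
        items = self.load_items(spider)  # same export step as the newt.db version
        with open(f'{self.job_id}.jsonl', 'a') as f:
            f.write(json.dumps(vars(items), default=str) + '\n')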
def post_process_exports(self, spider, task):
    """
    A hook point for customization after process_exports. In this example, we
    append the returned scraper object to a class attribute called `events`.
    """
    self.events.append(spider)
    logger.info(f'{self.name} has {spider.stock} inventory status.')
    logger.info(f'pricing: {spider.price}')
    logger.info(f'Worker {self.name}-{self.number} finished task {task}')
# 4) Last, set up the Manager. You can constrain the number of workers
# actually deployed through the `pool` parameter. For example, this is useful
# when using a Crawlera 'C10' plan, which limits concurrency to 10. To deploy
# all the workers concurrently, set the pool to be marginally larger than the
# total number of workers assigned in groups in step #3 above.
manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups,
                                pool=10, connection=connection)

if __name__ == "__main__":
    from kombu.utils.debug import setup_logging

    # setup root logger
    setup_logging(loglevel='INFO', loggers=[''])

    with Connection('amqp://*****:*****@localhost:5672//') as conn:
        try:
            manager.main()  # call manager.main() to start the job
        except KeyboardInterrupt:
            print('bye bye')

    # below shows an example of navigating your persisted data after the scrape
    result = get_job_results(ndb, 'books_scrape')
    logger.info('Printing: books_scrape result =>')
    if result:
        for r in result:
            logger.info(f"{r['book_title']}, {r['price']}, {r['stock']}")
    delete_job(ndb, 'books_scrape')
        # if there is more than one tracker, use something like the _publish
        # above, with a for loop for each tracker
        routing_key=routing_key,
        exchange=exchange,
        declare=[exchange],
    )


if __name__ == '__main__':
    from kombu import Connection
    from kombu.utils.debug import setup_logging

    # setup root logger
    setup_logging(loglevel='INFO', loggers=[''])

    keyword_1 = '["Soumission"]'
    keyword_2 = '["Rip it Up and Start Again"]'
    keywords = '["Black Dust", "When We Collided"]'

    with Connection("pyamqp://*****:*****@localhost:5672//") as conn:
        send_as_task(conn, keywords=keyword_1, routing_key='books.toscrape.com',
                     exchange=tasks.task_exchange, kwargs={})
        logger.info(f'sent task {keyword_1}')

        send_as_task(conn, keywords=keyword_2, routing_key='books.toscrape.com',
                     exchange=tasks.task_exchange, kwargs={})
        logger.info(f'sent task {keyword_2}')

        send_as_task(conn, keywords=keywords, routing_key='books.toscrape.com',
                     exchange=tasks.task_exchange, kwargs={})
        logger.info(f'sent task {keywords}')