Example #1
def delete_job(ndb, job_id: str = None):
    """
    CAUTION: there is no going back from a delete.

    For example, if you have a list `ndb.root.scrape_lists.lists['job_id_1']`
    containing a 6-hour run of 50 scrape jobs, this will delete it entirely.

    So you may want to save the scrapes you want to keep by moving them to
    another list first.

    The term `job_name` is just an alias for a newt scrapes.list name.

    When a WorkerManager is initiated with a `job_name`, it will
    either create a new scrapes.list name or else just add all the
    jobs to the existing list if it matches the `job_name`.

    It turns out that deleting in newt is not straightforward. You can remove
    the object, but the data remains in postgresql until the database is `packed`.

    Packing the database should be done periodically in production, for
    example from a celery cron task or similar method. To pack the database,
    all you need to do is run ndb.db().pack(days=<int>). Usually, I use 3 days:
    ndb.db().pack(days=3).

    :param ndb: an instance of ndb. Please refer to
    examples/books_to_scrape/persistence/newt_db.py for an example.

    :return: None; the outcome is logged.
    """
    try:
        del ndb.root.spiders.lists[job_id]
        ndb.commit()
        logger.info(f'Deleted {job_id}')
    except KeyError:
        logger.info(f'Job-ID {job_id} does not exist.')
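
The docstring above suggests packing the database periodically from a celery cron job. Below is a minimal, hedged sketch of what that might look like; the Celery app name, broker URL, schedule, and the import path for `ndb` (based on the examples/books_to_scrape/persistence/newt_db.py reference) are assumptions, not part of the original example.

from celery import Celery
from celery.schedules import crontab

# assumed import path, per the examples/books_to_scrape/persistence/newt_db.py reference
from examples.books_to_scrape.persistence.newt_db import ndb

app = Celery('maintenance', broker='amqp://localhost:5672//')  # hypothetical broker URL


@app.task(name='maintenance.pack_newt_db')
def pack_newt_db():
    """Remove object revisions older than three days, as the docstring recommends."""
    ndb.db().pack(days=3)


# run the pack task nightly at 03:00 (the schedule itself is an assumption)
app.conf.beat_schedule = {
    'pack-newt-db-nightly': {
        'task': 'maintenance.pack_newt_db',
        'schedule': crontab(hour=3, minute=0),
    },
}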
Example #2
    def monitor(self, target):
        """
        This method spawns the scrapers and then serves as a hook point,
        allowing some final actions to be performed on the scraper object
        after the worker completes the scrape job, but before it shuts down
        and the original object instance is lost.

        Here, we have access to the scrape results and could do some final
        transformations or data checking: for example, inspect the result
        and then update some results list, or implement some other
        persistence method if you don't want to use postgresql with newt.db.

        :param target: the target parameter here is a <Worker()> class object and
        you must call target.spawn_spider() to start the Worker.
        """
        logger.info(f'spawning {target}')
        target.spawn_spider()  # this call is required to start the Worker
        # Calling spawn_spider() above instructs the Worker object to start
        # the scrape. So there will be some wait period at this point for each
        # worker to actually run out of work and quit with a graceful shutdown.
        # Therefore, A GOOD SPOT TO HOOK SOME POST-SCRAPE LOGIC ON YOUR WORKER'S
        # RESULTS IS RIGHT HERE. For example, I've simply set `events = []` as a
        # class attribute on the BaseWorker <Worker> object and then appended
        # each returned scraper to `events` after each scrape completes.
        for event in target.events:
            # here, event represents returned scraper objects which the worker has
            # completed. We can iterate through the event objects and, for example,
            # apply some data transformation, delete failed scrapes, or save data
            logger.info(f'THIS IS A MONITOR EVENT - > {event}')
        # This last line is required; ensure the gevent.sleep(0) below remains.
        gevent.sleep(0)
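
As a concrete illustration of the hook described in the comments above, the completed scraper objects collected in `target.events` could be checked before they are lost. The helper below is purely hypothetical; it assumes the scraper objects expose a `price` attribute, as in the post_process_exports example further down.

import logging

logger = logging.getLogger(__name__)


def check_events(target):
    """Hypothetical post-scrape check, callable from the event loop in monitor()."""
    kept, dropped = [], []
    for event in target.events:
        # `price` is assumed to be an attribute of the returned scraper object
        if getattr(event, 'price', None):
            kept.append(event)
        else:
            dropped.append(event)
    logger.info(f'kept {len(kept)} of {len(kept) + len(dropped)} completed scrapes')
    return kept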
Example #3
    def manage(self):
        """
        Manage will hand out work when the appropriate Worker is free.
        The manager timeout must be less than the worker timeout, or else the
        workers will be idled and shut down.
        """
        try:
            while True:
                for name, workgroup in self.workgroups.items():
                    for qname, q in self.qitems.items():
                        if name == qname:  # workgroup name must match tracker name
                            # a tracker with the same name as the workgroup is,
                            # effectively, the workgroup's task queue, so now
                            # assign a task to a worker from the workgroup's task queue
                            for worker in workgroup:
                                one_task = q.get(timeout=self.mgr_qtimeout)
                                worker.tasks.put(one_task)
                gevent.sleep(0)
        except Empty:
            self.mgr_no_work = True
            if self.mgr_should_stop:
                logger.info(f"Assigned all {name} work. I've been told I should stop.")
                self.should_stop = True
            else:
                logger.info(f"Assigned all {name} work. Awaiting more tasks to assign.")
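
The loop above depends on gevent queue semantics: `q.get(timeout=...)` blocks until an item arrives and raises `Empty` once the timeout elapses, which is what breaks the `while True` loop and marks the manager as out of work. A minimal standalone sketch of that behavior:

import gevent
from gevent.queue import Queue, Empty

q = Queue()
q.put('task-1')

try:
    while True:
        task = q.get(timeout=1)  # raises Empty after 1 second with no items
        print(f'assigned {task}')
        gevent.sleep(0)          # yield to other greenlets, as manage() does
except Empty:
    print('queue drained; manage() would now set mgr_no_work = True')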
Example #4
    def process_task(self, body, message):
        """
        Process messages to extract the task keywords and then
        load them into a gevent Queue for each tracker.

        To customize how this Manager class works with the broker,
        this method should be a top consideration to override.

        `kwargs` is not currently used, but it could be useful for
        setting logic flags used in this method.
        """
        keywords = body['keywords']
        kwargs = body['kwargs']
        logger.info(f'Got task: {reprcall(keywords)}')
        try:
            if isinstance(keywords, str):
                keywords = json.loads(keywords)
            for key in self.qitems.keys():
                for item in keywords:
                    self.qitems[key].put(item)
            if not self.mgr_should_stop:
                if self.mgr_no_work:
                    gevent.spawn(self.manage).join()
        except Exception as exc:
            logger.error(f'task raised exception: {exc}')
        message.ack()
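
The body this method consumes is expected to carry `keywords` (either a JSON-encoded string or a plain list) and `kwargs`, matching what `send_as_task` publishes in the last example. A minimal sketch of such a body and how it is decoded; any keys beyond `keywords` and `kwargs` would be assumptions:

import json

# sketch of a task message body, modeled on the send_as_task calls in the last example
body = {
    'keywords': '["Black Dust", "When We Collided"]',  # JSON string, decoded here
    'kwargs': {},                                       # reserved for logic flags
}

keywords = body['keywords']
if isinstance(keywords, str):
    keywords = json.loads(keywords)  # -> ['Black Dust', 'When We Collided']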
Example #5
def get_job_results(ndb, job_id: str = None):
    """
    Return a list of results of a job_id.

    :param ndb: an instance of ndb. Please refer to
    examples/books_to_scrape/persistence/newt_db.py for an example.

    :param job_id: the `job_id` assigned to the WorkerManager during the scrape.
    It is used as the name of the list to which resulting objects are saved in newt.db.

    :return: [<SplashScraperData(('books.toscrape.com', 'soulsearcher'))>, ...]
    """
    try:
        return ndb.root.spiders.lists[job_id].results
    except KeyError:
        logger.info(f'Job-ID {job_id} does not exist.')
Example #6
    def spawn_spider(self, **kwargs):
        """
        Start the Worker and execute tasks until its task queue is empty.
        """
        try:
            while True:
                task = self.tasks.get(
                    timeout=self.qtimeout)  # decrements the queue by 1
                logger.info(
                    f'Worker {self.name}-{self.number} got task {task}')
                spider = self.get_spider(task, **kwargs)
                spider.start_http_session(**self.http_session)
                # OK, right here is where we wait for the spider to return a result.
                self.result(spider, task)
        except Empty:
            logger.info(f'Quitting time for worker {self.name}-{self.number}!')
Example #7
    def pre_process_exports(self, spider, task):
        """
        A hook point for customization before process_exports method is
        called.

        In this example, we use this method to save our spider data to
        postgresql using newt.db.

        :param spider: the Scraper or Crawler object (e.g. MouseKeyScraper())
        :param task: just passing through the task item for printing.
        """
        if self.job_id != 'NONE':
            try:
                # create the list with the job name if it doesn't already exist
                ndb.root.spiders.add(self.job_id, SpiderList())
                logger.info(
                    f'Worker {self.name}-{self.number} created a new spider '
                    f'list for {self.job_id}')
            except KeyError:
                # will be raised if there is already a list with the same job_name
                pass
            # export the scraper data to the items object
            items = self.load_items(spider)
            # save the items object to newt.db
            ndb.root.spiders[self.job_id].add(items)
            ndb.commit()
            logger.info(
                f'Worker {self.name}-{self.number} saved {items.__repr__()} to '
                f'scrape_list "{self.job_id}" for task {task}.')
        else:
            # if job_id is NONE then we'll skip saving the objects
            logger.info(
                f'Worker {self.name}-{self.number} said job_name is {self.job_id} '
                f'so will not save it.')
Example #8
    def post_process_exports(self, spider, task):
        """
        A hook point for customization after process_exports.

        In this example, we append the returned scraper object to a
        class attribute called `events`.

        """
        self.events.append(spider)
        logger.info(f'{self.name} has {spider.stock} inventory status.')
        logger.info(f'pricing: {spider.price}')
        logger.info(f'Worker {self.name}-{self.number} finished task {task}')
Example #9
# 4) Last, set up the Manager. You can constrain the number of workers actually
# deployed through the `pool` parameter. For example, this is useful
# when using a Crawlera 'C10' plan, which limits concurrency to 10. To deploy all
# the workers concurrently, set the pool marginally larger than the total
# number of workers assigned to groups in step #3 above.
manager = BooksWorkGroupManager('books_scrape',
                                tasks,
                                workgroups=groups,
                                pool=10,
                                connection=connection)

if __name__ == "__main__":

    from kombu.utils.debug import setup_logging
    # setup root logger
    setup_logging(loglevel='INFO', loggers=[''])
    with Connection('amqp://*****:*****@localhost:5672//') as conn:
        try:
            manager.main()  # call manager.main() to start the job.
        except KeyboardInterrupt:
            print('bye bye')

    # below shows an example of navigating your persisted data after the scrape
    result = get_job_results(ndb, 'books_scrape')
    logger.info('Printing: books_scrape result =>')
    if result:
        for r in result:
            logger.info(f"{r['book_title']}, {r['price']}, {r['stock']}")
        delete_job(ndb, 'books_scrape')
Example #10
                         # if there is more than one tracker, use something like
                         # the _publish above, with a for loop for each tracker
                         routing_key=routing_key,
                         exchange=exchange,
                         declare=[exchange],
                         )


if __name__ == '__main__':

    from kombu import Connection
    from kombu.utils.debug import setup_logging

    # setup root logger
    setup_logging(loglevel='INFO', loggers=[''])

    keyword_1 = '["Soumission"]'
    keyword_2 = '["Rip it Up and Start Again"]'
    keywords = '["Black Dust", "When We Collided"]'

    with Connection("pyamqp://*****:*****@localhost:5672//") as conn:
        send_as_task(conn, keywords=keyword_1, routing_key='books.toscrape.com',
                     exchange=tasks.task_exchange, kwargs={})
        logger.info(f'sent task {keyword_1}')
        send_as_task(conn, keywords=keyword_2, routing_key='books.toscrape.com',
                     exchange=tasks.task_exchange, kwargs={})
        logger.info(f'sent task {keyword_2}')
        send_as_task(conn, keywords=keywords, routing_key='books.toscrape.com',
                     exchange=tasks.task_exchange, kwargs={})
        logger.info(f'sent task {keywords}')