Example #1
def bts_book_manager(_BooksWorker):
    """
    A BooksToScrape Manager test fixture for a live network call.
    Here, we spin up two workers while we have three tasks. It is
    important to test with fewer workers than total tasks. There are
    plenty of ways to break this test when refactoring. One likely
    source is the BaseWorker class method `load_items`. It took me
    half a day to track down a bug in that method which caused this
    test to pass only when the number of workers equaled the number
    of tasks. That was previously the default way to run this test,
    so the bug went unnoticed.
    """
    # first, set up newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()

    # be sure to open this file in binary mode
    book_data_file = open('c:/temp/book_data.csv', 'a+b')
    exporters = [
        CsvItemExporter(fields_to_export=['book_title', 'stock', 'price'],
                        file=book_data_file,
                        encoding='utf_8_sig')
    ]

    file = get_file_path('book_titles.xlsx')
    trackers = ['books.toscrape.com']
    tasks = StatefulBook(file, trackers, keywords='titles', autorun=True)

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            workers=2,  # this creates 2 scrapers and assigns each a book as a task
            kwargs={'timeout': (3.0, 20.0)})
    ]
    manager = BooksWorkGroupManager('books_scrape',
                                    tasks,
                                    workgroups=groups,
                                    pool=5)

    yield manager

    # teardown
    delete_job('books_scrape')
    del ndb.root._spiders
    ndb.commit()
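
A minimal sketch of a test that consumes this fixture. It assumes the manager exposes a main() entry point to start the job, as in the transistor examples, and it reads back the same CSV path used by the exporter above; adjust both if your setup differs.

import csv

def test_bts_book_manager_live(bts_book_manager):
    # assumption: manager.main() starts the scrape and blocks until the
    # workers finish; swap in the real entry point if it differs
    bts_book_manager.main()

    # the CsvItemExporter in the fixture appends to this file, so the header
    # row plus at least one item row should be present after the run
    with open('c:/temp/book_data.csv', newline='', encoding='utf_8_sig') as f:
        rows = list(csv.reader(f))
    assert len(rows) > 1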
Example #2
def bts_broker_manager(_BooksToScrapeGroup, _BooksWorker, broker_tasks,
                       broker_conn):
    """
    A BooksToScrape Manager test fixture for a live network call.
    Here, the tasks are delivered through a broker (RabbitMQ).
    """
    # set up newt.db for testing
    ndb.root._spiders = SpiderLists()
    ndb.commit()

    # be sure to open this file in binary mode
    book_data_file = open('c:/temp/broker_data.csv', 'a+b')
    exporters = [
        CsvItemExporter(fields_to_export=['book_title', 'stock', 'price'],
                        file=book_data_file,
                        encoding='utf_8_sig')
    ]

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=_BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,
            workers=2,  # this creates 2 scrapers and assigns each a book as a task
            kwargs={'timeout': (3.0, 20.0)})
    ]
    manager = BooksWorkGroupManager('books_broker_scrape',
                                    broker_tasks,
                                    workgroups=groups,
                                    pool=5,
                                    connection=broker_conn)

    yield manager

    # teardown newt.db
    delete_job('books_broker_scrape')
    del ndb.root._spiders
    ndb.commit()
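
The broker_conn fixture above is injected by pytest. A minimal sketch of how it could be provided with kombu follows, assuming a local RabbitMQ instance; the URL and fixture scope are placeholders, and the broker_tasks fixture is omitted because it depends on the scheduler's queue API.

import pytest
from kombu import Connection

@pytest.fixture(scope='module')
def broker_conn():
    # hypothetical broker URL; point this at your own RabbitMQ instance
    conn = Connection('pyamqp://guest:guest@localhost:5672//')
    yield conn
    conn.release()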
Example #3
                          file=open('c:/temp/book_data.json', 'a+b'),
                          encoding='utf_8_sig')
]

# 4) Set up the WorkGroups. You can create an arbitrary number of WorkGroups in a list.
# For example, if there are three different domains you want to search for the book
# titles from the excel file, and you want to scrape the price and stock data for each
# book title on each of those three websites, you could set up three different
# WorkGroups here (a sketch of a multi-group setup follows the single-group example
# below). Last, the WorkGroup.name should match the tracker name.

groups = [
    WorkGroup(
        name='books.toscrape.com',
        url='http://books.toscrape.com/',
        spider=BooksToScrapeScraper,
        worker=BooksWorker,
        items=BookItems,
        loader=BookItemsLoader,
        exporters=exporters,
        workers=3,  # this creates 3 scrapers and assigns each a book as a task
        kwargs={'timeout': (3.0, 20.0)})
]
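
A sketch of the multi-group setup described in step #4. Only books.toscrape.com is real; the second domain, its spider, and the variable name multi_domain_groups are hypothetical placeholders.

multi_domain_groups = [
    WorkGroup(
        name='books.toscrape.com',          # matches the first tracker name
        url='http://books.toscrape.com/',
        spider=BooksToScrapeScraper,
        worker=BooksWorker,
        items=BookItems,
        loader=BookItemsLoader,
        exporters=exporters,
        workers=3,
        kwargs={'timeout': (3.0, 20.0)}),
    WorkGroup(
        name='books.example-two.com',       # hypothetical second tracker
        url='http://books.example-two.com/',
        spider=ExampleTwoScraper,           # hypothetical spider for this domain
        worker=BooksWorker,
        items=BookItems,
        loader=BookItemsLoader,
        exporters=exporters,
        workers=3,
        kwargs={'timeout': (3.0, 20.0)}),
]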

# 5) Last, set up the Manager. You can constrain the number of workers actually
# deployed through the `pool` parameter. For example, this is useful when using a
# Crawlera 'C10' plan, which limits concurrency to 10. To deploy all the workers
# concurrently, set the pool one higher than the total number of workers assigned
# across the groups in step #4 above. The +1 is for the pool manager (see the pool
# arithmetic sketch after the Manager below).
manager = BooksWorkGroupManager('books_scrape',
                                tasks,
                                workgroups=groups,
                                pool=5)
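
A short sketch of the pool arithmetic from step #5, with hypothetical numbers: two groups of three workers each means six workers total, so pool=7 deploys them all at once, while a smaller pool staggers them.

workers_per_group = [3, 3]               # hypothetical: two WorkGroups with 3 workers each
total_workers = sum(workers_per_group)   # 6 workers in total
pool_size = total_workers + 1            # +1 for the pool manager -> pool=7 deploys all at once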