Example #1
    def write(self):
        """
        Write your scraper's exported custom data attributes to the
        BookScraperItems class which will be persisted in the database.

        Call super() to also capture attributes built-in from the Base classes.

        Last, ensure you assign the attributes to `self.items` and also finally
        you must return self.items in this method!
        """

        # now, define your custom items
        self.items['book_title'] = self.spider.book_title
        self.items['stock'] = self.spider.stock
        # set the value with self.serialize_field(field, name, value) as needed;
        # for example, `serialize_price` below turns '£50.10' into 'UK £50.10'.
        # '£50.10' is the original scraped value from the website, stored in
        # self.spider.price, but we think 'UK £50.10' is clearer.
        self.items['price'] = self.serialize_field(
            field=Field(serializer=serialize_price),
            name='price',
            value=self.spider.price)

        # call super() to write the built-in Items from BaseItemExporter
        super().write()

        return self.items
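
`serialize_price` is not defined in this excerpt; a minimal sketch that matches the comment above (turning '£50.10' into 'UK £50.10') could be:

def serialize_price(value):
    # prefix the raw scraped price string, e.g. '£50.10' -> 'UK £50.10'
    return f'UK {value}'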
Example #2
class TestItem(Item):
    name = Field()

    def get_name(self):
        return self['name']

    def change_name(self, name):
        self['name'] = name
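
For context, Item objects support dict-style access, which is what the two helper methods above wrap. A quick usage sketch:

item = TestItem(name='first')
print(item.get_name())      # 'first'
item.change_name('second')
print(item['name'])         # 'second'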
Example #3
class BookItems(newt.db.Persistent, SplashScraperItems):
    """
    A class object which will encapsulate the data from the scraper
    object. It will itself be persisted in PostgreSQL and also
    further serialized to a JSONB field, which is automatically
    indexed for fast search queries.  The caveat is, the data encapsulated
    in this class object must all be pickleable. The main items we deal
    with which are not pickleable are beautifulsoup4 objects.

    Don't try to persist a beautifulsoup4 object in postgres with newt.db.
    To avoid issues, ensure that the result from a beautifulsoup4 object
    is cast to string. Wrapping it with str() will avoid issues.
    """
    # -- names of your customized scraper class attributes go here -- #

    book_title = Field()  # str(), the book title which we searched for
    price = Field()  # the self.price attribute
    stock = Field()  # the self.stock attribute
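
Following the docstring's advice, cast any beautifulsoup4 result to a plain string before assigning it to one of these fields. A minimal sketch (the markup is made up for illustration):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<h1>A Light in the Attic</h1>', 'html.parser')
result = soup.find('h1').string  # a bs4 NavigableString, not a plain str
book_title = str(result)         # cast with str() so it pickles cleanly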
Example #4
class TestItem(Item):
    name = Field()
Example #5
class TestItem(Item):
    name = Field()
    number = Field()
Example #6
class C(object):
    fields = {'load': Field(default='C')}
    not_allowed = Field(default='not_allowed')
    save = Field(default='C')
Example #7
class A(Item):
    fields = {'load': Field(default='A')}
    save = Field(default='A')
Example #8
class E(C, B):
    load = Field(default='E')
Example #9
class TestItem(Item):
    name = Field()
    keys = Field()
    values = Field()
Example #10
class C(A):
    fields = {'update': Field(default='C')}
    save = Field(default='C')
Example #11
class A(Item):
    fields = {'update': Field(default='A')}
    save = Field(default='A')
    load = Field(default='A')
Example #12
class C(Item):
    fields = {'load': Field(default='C')}
    save = Field(default='C')
Example #13
class TestItem(BaseItem):
    keys = Field()
Example #14
class BaseItem(Item):
    name = Field()
    keys = Field()
    values = Field()
Example #15
class TestItem(Item):
    fields = {'new': Field(default='X')}
Example #16
class SplashScraperItems(Item):
    """
    A base class which should be inherited in a subclass which then implements
    a customized SplashScraperContainer instance. It declares the standard
    data attributes returned by a SplashScraper object, with Field().  Declaring
    with Field() enables a useful data container to be created for the attribute.
    """

    # -- the Scraper's browser class -> self.browser data -- #

    # self.browser.raw_content
    raw_content = Field()
    # this is the status code received from splash (NOT THE ENDPOINT)
    # self.browser.status
    status = Field()
    # self.browser.get_current_request() -> <PreparedRequest [POST]>
    current_request = Field()
    # self.browser.get_current_url() -> 'http://localhost:8050/execute'
    current_url = Field()
    # flag for a python-requests timeout error, which usually means there was
    # some network problem, or that the second number in the timeout tuple,
    # like (3.0, 700.0), was not long enough
    timeout_exception = Field()

    encoding = Field()
    ucontent = Field()
    resp_content = Field()
    resp_content_type_header = Field()
    resp_headers = Field()
    har = Field()
    png = Field()
    endpoint_status = Field()
    crawlera_session = Field()
    html = Field()

    # /end self.browser

    # -- splash scraper class attributes -- #

    # get the __repr__()
    scraper_repr = Field()
    name = Field()
    # str() or int(), `number` is used to enumerate individual workers
    number = Field()
    # the specially prepared self.cookies which would need to be set by us
    cookies = Field()
    splash_args = Field()
    http_session_valid = Field()
    baseurl = Field()
    crawlera_user = Field()
    referrer = Field()
    searchurl = Field()
    LUA_SOURCE = Field()
    _test_true = Field()
    _result = Field()
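
Since each attribute above is declared with Field(), a subclass instance behaves as a dict-style container for these names. A small sketch using the BookItems subclass from Example #3 (the values are illustrative):

items = BookItems()
items['name'] = 'books.toscrape'              # declared on SplashScraperItems
items['book_title'] = 'A Light in the Attic'  # declared on BookItems
print(items['name'], items['book_title'])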
Example #17
class D(B, C):
    fields = {'update': Field(default='D')}
    load = Field(default='D')
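
Examples #4 through #17 are excerpts from a test suite exercising how Field() declarations and explicit `fields` dicts merge across Item inheritance. A minimal sketch of the basic rule, assuming Scrapy-style Item/Field semantics (the class names here are illustrative):

from scrapy import Field, Item

class Base(Item):
    save = Field(default='Base')
    load = Field(default='Base')

class Child(Base):
    load = Field(default='Child')  # redeclares 'load' with new metadata

# Every Field() declaration in the hierarchy is collected into the class's
# `fields` mapping; the most derived declaration wins per field name.
print(Child.fields['save'])  # {'default': 'Base'}  -- inherited from Base
print(Child.fields['load'])  # {'default': 'Child'} -- overridden in Child

# Note: 'default' is plain Field metadata; it does not auto-populate values.
item = Child(load='X')
print(item['load'])  # 'X'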