class Article(Entity):
    """Article entity whose fields are located with relative XPath queries.

    Fields:
        title: selected by ``./h1/text()``.
        content: selected by ``./p/text()``; declared with ``all=True``
            (presumably every matching node is collected -- confirm against
            the field implementation).
        authors: nested ``Author`` entities rooted at each
            ``./div[@class="author"]`` node; also declared with ``all=True``.
    """

    # NOTE(review): ``Author`` is evaluated while this class body executes,
    # so it has to be defined or imported before this class.
    title = StringField('./h1/text()', xpath=True)
    content = StringField('./p/text()', xpath=True, all=True)
    authors = EntityField(
        Author,
        './div[@class="author"]',
        xpath=True,
        all=True,
    )
class BookChapter(Entity):
    """An entity with various different field types.

    Each field is located with a CSS selector; several also pass an ``re``
    pattern whose single capture group isolates the value of interest
    (presumably applied to the selected text -- confirm against the field
    implementation).

    Fields:
        book: ``body>h1::text`` as a string.
        number: ``.chnum::text`` as an integer field.
        price: ``.cost::text`` as a float field, capturing what follows a
            literal ``$``.
        public: ``p::text`` as a boolean field, capturing what follows
            ``Public: ``.
        updated: ``p::text`` as a datetime field, capturing what follows
            ``Last updated on ``.
        next_url: the ``href`` attribute of ``#next``; declared with
            ``lower=True``.
    """

    book = StringField('body>h1::text')
    number = IntField('.chnum::text')
    # Regex patterns are raw strings: '\$' inside a normal literal is an
    # invalid escape sequence (a warning today, an error in future Python
    # versions).  The resulting pattern text is unchanged.
    price = FloatField('.cost::text', re=r'\$(.+)')
    public = BoolField('p::text', re=r'Public: (.+)')
    # NOTE(review): the final '.' is an unescaped wildcard, so it consumes
    # whatever single character ends the match (not only a full stop).
    # Kept as-is to preserve the existing matching behaviour.
    updated = DateTimeField('p::text', re=r'Last updated on (.+).')
    next_url = StringField('#next::attr("href")', lower=True)
class SimpleContent(Entity):
    """Minimal entity whose fields are all declared via XPath expressions.

    Fields:
        title: ``//div/h1/text()``.
        link_text: ``//div/a/text()``.
        link_url: ``//div/a/@href``.
        urls: ``/html/body/div/a/@href`` with ``all=True`` and ``lower=True``.
        in_divs: ``//div/div`` with ``all=True``.

    ``process_title`` chains ``six.text_type.capitalize`` with ``RAdd('!')``;
    presumably the entity applies it to the ``title`` value after extraction
    (naming convention -- confirm against ``Entity``).
    """

    title = StringField('//div/h1/text()', xpath=True)
    link_text = StringField('//div/a/text()', xpath=True)
    link_url = StringField('//div/a/@href', xpath=True)
    urls = StringField(
        '/html/body/div/a/@href',
        xpath=True,
        all=True,
        lower=True,
    )
    in_divs = StringField('//div/div', xpath=True, all=True)

    process_title = Chain(
        six.text_type.capitalize,
        RAdd('!'),
    )
class AuthorC(Entity):
    """Author entity located with CSS selectors.

    Fields:
        firstname: ``span.firstname::text``.
        lastname: ``span.lastname::text``.
    """

    firstname = StringField('span.firstname::text')
    lastname = StringField('span.lastname::text')


class ArticleC(Entity):
    """Article entity located with CSS selectors.

    Fields:
        title: ``h1::text``.
        content: ``p::text``; declared with ``all=True`` (presumably every
            match is collected -- confirm against the field implementation).
        authors: nested ``AuthorC`` entities rooted at each ``div.author``
            node; also declared with ``all=True``.
    """

    title = StringField('h1::text')
    content = StringField('p::text', all=True)
    # ``AuthorC`` is looked up while this class body runs, which is why it is
    # defined before ``ArticleC``.
    authors = EntityField(AuthorC, 'div.author', all=True)
class Author(Entity):
    """Author entity whose name parts are located with relative XPath queries.

    Fields:
        firstname: ``./span[@class="firstname"]/text()``.
        lastname: ``./span[@class="lastname"]/text()``.
    """

    firstname = StringField(
        './span[@class="firstname"]/text()',
        xpath=True,
    )
    lastname = StringField(
        './span[@class="lastname"]/text()',
        xpath=True,
    )
class Blog(Entity):
    """Blog entity whose fields are located with relative XPath queries.

    Fields:
        title: ``./h1/text()``.
        content: ``./p/text()``; declared with ``all=True`` (presumably every
            matching node is collected -- confirm against the field
            implementation).
    """

    title = StringField(
        './h1/text()',
        xpath=True,
    )
    content = StringField(
        './p/text()',
        xpath=True,
        all=True,
    )