class NestedOuterPage(PageModel):
    """Page model whose tree embeds another page model as a field.

    Inside ``div.outer`` it declares ``outertxt`` (a ``Text`` on the
    ``> span`` node) and ``nested`` (a ``NestedInnerPage`` on the
    ``div.inner`` node).  ``model_class`` is ``dict``.
    """

    model_class = dict

    page_tree = Html(
        StrictNode("div.outer")(
            Node("> span")(outertxt=Text()),
            Node("div.inner")(nested=NestedInnerPage()),
        ),
    )
class SimplePage(PageModel):
    """Page model declaring three text fields under ``body``.

    Fields: ``div1`` on ``div.div_1``, ``div2`` on the node declared with
    the two selectors ``#div_2`` / ``#asdf``, and ``body`` on the ``body``
    node itself.  The bare ``Node("span")`` declares no field.
    """

    model_class = dict

    page_tree = Html(
        Node("body")(
            Node("div.div_1")(div1=Text()),
            # Two selectors on one node -- presumably alternatives; confirm
            # against the Node implementation.
            Node("#div_2", "#asdf")(div2=Text()),
            Node("span"),
            body=Text(),
        ),
    )
class SubSense(PageModel):
    """Page model for a sub-sense block, built into ``models.SubSense``.

    Declared fields: ``style_level``, ``subject_area``, ``syntax_coding``,
    ``definition``, ``original_key`` (list node concatenated with
    ``" | "``) and ``examples`` (``Example`` sub-models).
    """

    model_class = models.SubSense

    page_tree = StrictHtml(
        Node.optional("> div.SENSE-NUM"),
        # NOTE(review): "> span.SYNTAX-CODING" is declared twice, here with
        # no field and again below with ``syntax_coding`` -- confirm that
        # this is intentional.
        Node.optional("> span.SYNTAX-CODING"),
        Node.optional("> span.STYLE-LEVEL")(
            style_level=Text(),
        ),
        Node.optional("> span.SUBJECT-AREA")(
            subject_area=Text(),
        ),
        Node.optional("> span.SYNTAX-CODING")(
            syntax_coding=Text(),
        ),
        Node("> span.DEFINITION", "> span.QUICK-DEFINITION")(
            definition=Text(),
        ),
        Node.list(
            "> strong",
            "> span.SENSE-VARIANT span.BASE",
            "> span.MULTIWORD span.BASE",
        ).concat(" | ")(
            original_key=Text(),
        ),
        Node.list("> div.EXAMPLES")(
            examples=Example(),
        ),
        Node.optional("> div.THES"),
    )
class Example(PageModel):
    """Page model for a usage example, built into ``models.Example``.

    Declared fields: ``original_key`` (optional ``strong`` node) and
    ``content`` (``p.EXAMPLE`` node).  The optional ``div.SEP`` node
    declares no field.
    """

    model_class = models.Example

    page_tree = StrictHtml(
        Node.optional("strong")(
            original_key=Text(),
        ),
        Node.optional("div.SEP"),
        Node("p.EXAMPLE")(
            content=Text(),
        ),
    )
class PostprocPage(PageModel):
    """Page model that rewrites its extracted dict in ``postproc``.

    The tree declares one field, ``lower`` (text of ``div.lower``);
    ``postproc`` replaces it with an upper-cased ``upper`` entry.
    """

    model_class = dict

    page_tree = Html(
        Node("div.lower")(lower=Text()),
    )

    @classmethod
    def postproc(cls, dic):
        """Move ``dic['lower']`` to ``dic['upper']`` upper-cased; return ``dic``.

        A missing ``lower`` entry is treated as the empty string.
        """
        lowered = dic.pop('lower', '')
        dic['upper'] = lowered.upper()
        return dic
class AttrPage(PageModel):
    """Page model reading attributes and text from one ``a.mylink`` node.

    Declared fields: ``href`` and ``title`` (``Attr`` of the same names)
    and ``text`` (``Text``).
    """

    model_class = dict

    page_tree = Html(
        Node("a.mylink")(
            href=Attr("href"),
            title=Attr("title"),
            text=Text(),
        ),
    )
class PhrasalVerbLink(PageModel):
    """Page model for a phrasal-verb link, built into ``models.Link``.

    From the ``a`` node: ``url`` is the ``href`` attribute and ``key`` the
    ``title`` attribute.  ``link_type`` and ``part_of_speech`` are fixed
    ``Constant`` values.
    """

    model_class = models.Link

    page_tree = Html(
        Node("a")(
            url=Attr("href"),
            key=Attr("title"),
            link_type=Constant("phrasal verbs"),
            part_of_speech=Constant("phrasal verb"),
        ),
    )
class RelatedWordLink(PageModel):
    """Page model for a related-word link, built into ``models.Link``.

    From the ``a`` node: ``key`` is the ``title`` attribute, ``url`` the
    ``href`` attribute, ``link_type`` the constant ``"related words"``, and
    ``part_of_speech`` the text of an optional ``span.PART-OF-SPEECH``
    child.  ``postproc`` then trims the part-of-speech suffix off ``key``.
    """

    model_class = models.Link

    page_tree = Html(
        Node("a")(
            Node.optional("span.PART-OF-SPEECH")(
                part_of_speech=Text(),
            ),
            key=Attr("title"),
            url=Attr("href"),
            link_type=Constant("related words"),
        ),
    )

    @classmethod
    def postproc(cls, dic):
        """Trim the trailing part-of-speech from ``dic['key']``; return ``dic``.

        Removes ``len(part_of_speech)`` characters from the end of the key
        and strips surrounding whitespace.  With no part of speech the key
        is only stripped.

        Raises:
            KeyError: if ``dic`` has no ``"key"`` entry.
        """
        key = dic["key"]
        pos = dic.get("part_of_speech") or ""
        # Guard the slice: with an empty suffix, ``key[:-0]`` is ``key[:0]``
        # and would erase the whole key.
        if pos:
            key = key[:-len(pos)]
        dic["key"] = key.strip()
        # Return the dict, as the other postproc hooks in this file do.
        return dic
class Entry(PageModel):
    """Page model for a dictionary entry, built into ``models.Entry``.

    Declared fields: ``original_key``, ``style_level``, ``pron``,
    ``part_of_speech``, ``intro_paragraph``, ``senses`` (``Sense``
    sub-models) and three link lists (``phrs``, ``phrvbs``, ``relwrds``)
    that ``postproc`` merges into a single ``links`` entry.
    """

    model_class = models.Entry

    page_tree = Html(
        Node("div#headword div#headwordleft span.BASE")(
            original_key=Text(),
        ),
        Node("div#headbar")(
            Node.optional("span.STYLE-LEVEL")(
                style_level=Text(),
            ),
            Node.optional("span.PRON")(
                pron=Text(),
            ),
            Node.optional("span.PART-OF-SPEECH")(
                part_of_speech=Text(),
            ),
        ),
        Node.optional("div.SUMMARY div.p")(
            intro_paragraph=Text(),
        ),
        Node.list("div.SENSE-BODY")(
            senses=Sense(),
        ),
        Node.optional("div#phrases_container > ul")(
            Node.list("li")(
                phrs=PhraseLink(),
            ),
        ),
        Node.optional("div#phrasal_verbs_container > ul")(
            Node.list("li")(
                phrvbs=PhrasalVerbLink(),
            ),
        ),
        Node.optional("div.entrylist > ul")(
            Node.list("li")(
                relwrds=RelatedWordLink(),
            ),
        ),
    )

    @classmethod
    def postproc(cls, dic):
        """Merge the three link lists into ``dic['links']``; return ``dic``.

        The order is related words, then phrasal verbs, then phrases.
        Missing lists count as empty, and the three source keys are removed.
        """
        related = dic.pop('relwrds', [])
        phrasal_verbs = dic.pop('phrvbs', [])
        phrases = dic.pop('phrs', [])
        dic['links'] = related
        dic['links'] += phrasal_verbs + phrases
        return dic
class InvalidPage(PageModel):
    """Page model declaring the field name ``dupfield`` on two nodes.

    Judging by the class name, the duplicate is deliberate (an
    invalid-definition fixture) -- confirm before "fixing" it.
    """

    model_class = dict

    page_tree = Html(
        Node("div.one")(dupfield=Text()),
        Node("div.two")(dupfield=Text()),
    )
class OptionalNodePage(PageModel):
    """Page model with a single optional node, ``div.missing`` -> ``x``."""

    model_class = dict

    page_tree = Html(
        Node.optional("div.missing")(
            x=Text(),
        ),
    )
class ThisClassElem(PageModel):
    """Page model with a ``head`` text field and an optional ``tail``.

    ``tail`` is declared with ``ThisClass()`` -- presumably a
    self-reference to this same model; confirm against ``ThisClass``.
    """

    model_class = dict

    page_tree = Html(
        Node("> div.head")(
            head=Text(),
        ),
        Node.optional("> div.tail")(
            tail=ThisClass(),
        ),
    )
class MissingNodePage(PageModel):
    """Page model with a single required node, ``div.missing`` -> ``x``."""

    model_class = dict

    page_tree = Html(
        Node("div.missing")(
            x=Text(),
        ),
    )
class ThisClassPage(PageModel):
    """Page model whose ``li`` field is a ``ThisClassElem`` at ``div.list``."""

    model_class = dict

    page_tree = Html(
        Node("div.list")(
            li=ThisClassElem(),
        ),
    )
class StrictPage(PageModel):
    """Page model with a ``StrictNode`` (``div.strict``) holding one ``span``.

    No fields are declared.
    """

    model_class = dict

    page_tree = Html(
        StrictNode("div.strict")(
            Node("span"),
        ),
    )
class ConstantPage(PageModel):
    """Page model whose only field, ``const``, is ``Constant("myconstant")``."""

    model_class = dict

    page_tree = Html(
        Node("div.doesnotmatter")(
            const=Constant("myconstant"),
        ),
    )
class ConcatPage(PageModel):
    """Page model using ``Node.list(...).concat(", ")``.

    Under ``div.list``, the ``span.elem`` list node declares the text field
    ``concatenated``.
    """

    model_class = dict

    page_tree = Html(
        Node("div.list")(
            Node.list("span.elem").concat(", ")(
                concatenated=Text(),
            ),
        ),
    )
class TakefirstPage(PageModel):
    """Page model using ``Node.list(...).take_first()``.

    The ``div.listelem`` list node declares the text field ``firstelem``.
    """

    model_class = dict

    page_tree = Html(
        Node.list("div.listelem").take_first()(
            firstelem=Text(),
        ),
    )
class NestedInnerPage(PageModel):
    """Page model with one text field, ``innertxt``, on a ``span`` node."""

    model_class = dict

    page_tree = Html(
        Node("span")(
            innertxt=Text(),
        ),
    )
class InvalidPageTwo(PageModel):
    """Page model passing ``Text()`` positionally, with no field name.

    Judging by the class name, this is a deliberate invalid-definition
    fixture -- confirm before "fixing" it.
    """

    model_class = dict

    page_tree = Html(
        Node("div")(
            Text(),
        ),
    )
class ListPage(PageModel):
    """Page model with a list node under ``.list``.

    The ``.listelem`` list node declares the text field ``mylistfield``.
    """

    model_class = dict

    page_tree = Html(
        Node(".list")(
            Node.list(".listelem")(
                mylistfield=Text(),
            ),
        ),
    )