示例#1
0
    def parse(self, doc):
        # some very simple heuristic rules for determining 
        # what an individual paragraph is
   
        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True
  
        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p
        
        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))
  
        # First paragraph of an RFC is always a header block 
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)
  
        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()
        
        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para])) 

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dct:title of the document and 
        # specify that it is in English
        desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

        # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
  
        # find and convert the publication date in the header to a datetime 
        # object, and set it as the dct:issued date for the document   
        re_date = re.compile("(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale(): 
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year,dt.month,dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dct'].issued, pubdate)
  
        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dct:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dct'].subject, cat_match.group(1))
            
        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs, 
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)
  
        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
# begin parse2                                   
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ",1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title, ordinal=ordinal, identifier=identifier)
                stack[1:] = [] # clear all but bottom element
                stack[0].append(s) # add new section to body
                stack.append(s)    # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[2:] = [] # clear all but bottom two elements
                stack[1].append(s) # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[3:] = [] # clear all but bottom three
                stack[-1].append(s) # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2                                   

# begin citation1                                   
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
# end citation1                                   

# begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                 uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                 uri += "#S" + parts['Sec']
            return uri
# end citation2                                   

# begin citation3
        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, 
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
示例#2
0
文件: rfc.py 项目: h4ck3rm1k3/ferenda
    def parse_header(self, header, desc):
        # split header in left-hand and right-hand side, and line by line
        lines = header.split("\n")
        left = [x.split("   ", 1)[0].strip() for x in lines]
        right = [x.split("   ", 1)[1].strip() for x in lines if "   " in x]
        # first line of lefthand side is publishing organization (?)
        desc.value(self.ns['dct'].publisher, left[0])
        # following lefthand side are key-value headers
        for line in left[1:]:
            if line.strip() == "":
                continue
            if ": " not in line:
                self.log.warning("Cannot treat %r as a key-value header" % line)
                continue

            (key, value) = (x.strip() for x in line.split(": "))
            if key == "Request for Comments":
                # make sure we only extract the numeric part --
                # normally value should be numeric, but we've seen
                # "RFC 1006", "#154" and there are doubtless other
                # variants
                value = re.sub("\D", "", value)
                if value:  # eg RFC 100
                    desc.value(self.ns['dct'].identifier, "RFC %s" % value)
            elif key == "Category":
                desc.value(self.ns['dct'].subject, value)
            elif key == "ISSN":
                desc.value(self.ns['dct'].issn, value)
            elif key in ("Updates", "Obsoletes"):
                pred = {'Updates': self.ns['rfc'].updates,
                        'Obsoletes': self.ns['rfc'].obsoletes}[key]

                for valuepart in value.split(", "):
                    rfcmatch = re.search('\d+', valuepart)
                    if rfcmatch:
                        uri = self.canonical_uri(rfcmatch.group(0))
                        desc.rel(pred, uri)
                    else:
                        self.log.warning("Can't pick out RFC number from line %s" % line)
            elif key == "BCP":
                desc.value(self.ns['rfc'].BCP, value)
            elif key == "STD":
                desc.value(self.ns['rfc'].STD, value)
            elif key == "FYI":
                desc.value(self.ns['rfc'].FYI, value)
            else:
                # Unknown headers seen: BCP, STD, FYI
                self.log.warning("Unknown header key %s (value %s)" % (key, value))

        # For right hand side, any line beginning with a single letter
        # followed by '. ' is probably a name
        for line in right:
            if re.match("[A-Z]\. ", line):
                desc.value(self.ns['dct'].creator, line)
            elif re.match("\w+ \d{4}$", line):
                # NOTE: this requires english locale!
                with util.c_locale():
                    dt = datetime.strptime(line, "%B %Y")
                d = date(dt.year, dt.month, dt.day)
                desc.value(self.ns['dct'].issued, d)
            else:
                # company affiliation - include that separate from
                # personal author identity
                desc.value(self.ns['dct'].rightsHolder, line)
示例#3
0
文件: dv.py 项目: h4ck3rm1k3/ferenda
    def polish_metadata(self, head, doc):
        basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

        def basefile_to_referat(basefile):
            templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                     'MD': 'MD %(year)s:%(ordinal)s'}
            m = basefile_regex.match(basefile)
            if m:
                return templ[m.group("type")] % (m.groupdict())

        def ref_to_uri(ref):
            # FIXME: We'd like to retire legalref and replace it with
            # pyparsing grammars.
            nodes = self.rattsfall_parser.parse(ref)
            uri = nodes[0].uri
            return localize_uri(uri)

        def dom_to_uri(domstol, malnr, avg):
            baseuri = self.config.url
            slug = self.slugs[domstol]
            return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

        def localize_uri(uri):
            if "publ/rattsfall" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                                   self.config.url + "res/dv")
            elif "publ/sfs/" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                                   self.config.url + "res/sfs")

        def split_nja(value):
            # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
            return [x[:-1] for x in value.split("(")]

        def sokord_uri(value):
            return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

        # 0. create Referat key if not present
        if "Referat" not in head:
            # For some courts (MD, AD, MOD?, MIG?) this is possible
            head["Referat"] = basefile_to_referat(doc.basefile)

        # 1. mint uris and create the two Describers we'll use
        refuri = ref_to_uri(head["Referat"])
        refdesc = Describer(doc.meta, refuri)
        domuri = dom_to_uri(head["Domstol"],
                            head["Målnummer"],
                            head["Avgörandedatum"])
        domdesc = Describer(doc.meta, domuri)

        # 2. convert all strings in head to proper RDF
        for label, value in head.items():
            if label == "Rubrik":
                value = util.normalize_space(value)
                refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
                domdesc.value(self.ns['dct'].title, value, lang="sv")

            elif label == "Domstol":
                domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
            elif label == "Målnummer":
                domdesc.rel(self.ns['rpubl'].malnummer, value)
            elif label == "Domsnummer":
                domdesc.rel(self.ns['rpubl'].domsnummer, value)
            elif label == "Diarienummer":
                domdesc.rel(self.ns['rpubl'].diarienummer, value)
            elif label == "Avdelning":
                domdesc.rel(self.ns['rpubl'].avdelning, value)
            elif label == "Referat":

                for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                    'arsutgava': r'(\d{4})',
                                    'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                    'sidnummer': r's.? ?(\d+)'}.items():
                    m = re.search(regex, value)
                    if m:
                        if pred == 'rattsfallspublikation':
                            # "NJA" -> "http://lcaolhost:8000/coll/dv/nja"
                            uri = self.config.url + "coll/dv/" + m.group(1).lower()
                            refdesc.rel(self.ns['rpubl'][pred], uri)
                        else:
                            refdesc.value(self.ns['rpubl'][pred], m.group(1))

                    if value.startswith("NJA"):
                        realvalue, extra = split_nja(value)
                        ordinal = extra.split(" ")[1]
                        refdesc.value(self.ns['dct'].bibliographicCitation,
                                      extra)
                        refdesc.rel(self.ns['owl'].sameAs,
                                    self.config.url + "res/dv/nja/" + ordinal)
                        refdesc.value(self.ns['dct'].identifier, realvalue)
                    else:
                        refdesc.value(self.ns['dct'].identifier, value)

            elif label == "Avgörandedatum":
                with util.c_locale():
                    d = datetime.strptime(value, '%Y-%m-%d')
                domdesc.value(self.ns['rpubl'].avgorandedatum, d)

            elif label == "Lagrum":
                for i in value:  # better be list not string
                    for node in self.lagrum_parser.parse(i):
                        if isinstance(node, Link):

                            domdesc.rel(self.ns['rpubl'].lagrum,
                                        localize_uri(node.uri))
            elif label == "Rättsfall":
                for i in value:
                    for node in self.rattsfall_parser.parse(i):
                        if isinstance(node, Link):
                            domdesc.rel(self.ns['rpubl'].rattsfall,
                                        localize_uri(node.uri))
            elif label == "Litteratur":
                for i in value.split(";"):
                    domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
            elif label == "Sökord":
                for s in self.re_delimSplit(value):
                    s = util.normalize_space(s)
                    if not s:
                        continue
                    # terms longer than 72 chars are not legitimate
                    # terms. more likely descriptions. If a term has a - in
                    # it, it's probably a separator between a term and a
                    # description
                    while len(s) >= 72 and " - " in s:
                        h, s = s.split(" - ", 1)
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                    if len(s) < 72:
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

        # 3. mint some owl:sameAs URIs
        refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
        domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

        # 4. Add some same-for-everyone properties
        refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
        refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
        domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
        refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
        # 5. assert that we have everything we need

        # 6. done!
        return refuri
示例#4
0
    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
# Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three
                stack[-1].append(
                    s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)