Пример #1
0
    def import_xml(self, filename):
        """Import a single XML file into ``Law`` records.

        Sets ``self.title`` from the root element's ``titlenum`` attribute and
        ``self.section`` from the ``num`` attribute of the first ``<section>``
        element, then stores:

        * a level-0 title row when the text of the first ``<hdsupnest>``
          element matches the ``TITLE <n> - <name>`` pattern (created only
          if no such row exists yet),
        * one row for the section, whose text is the XSLT-rendered body,
        * one row per ``section/sectioncontent/text`` element,
        * rows for every ``section/sectioncontent/psection`` element, via
          ``parse_psection``.

        ``self.ordering`` is advanced as rows are written.  Returns ``None``.
        The method returns early when the root element has no ``titlenum``
        attribute or when the document contains no ``<section>`` element.

        :param filename: path of the XML file to import; its basename is
            recorded as ``source`` on the section and subsection rows.
        """
        parser = lxml.etree.XMLParser(dtd_validation=False, load_dtd=False,
                                      resolve_entities=False, encoding="utf8")
        source = os.path.basename(filename)

        # Only the read needs the file handle; close it before the DB work.
        with open(filename) as f:
            file_contents = unescape(f.read())

        # Strip the XML declaration before parsing.
        # NOTE(review): presumably needed because lxml refuses already-decoded
        # input that still carries an encoding declaration -- confirm.
        file_contents = re.sub(
            r'''^<\?xml version="1.0" encoding="UTF-8"\s*\??>''',
            '', file_contents)
        # Re-escape every ampersand so that whatever unescape() produced
        # cannot be read as the start of an entity reference.
        file_contents = file_contents.replace("&", "&amp;")
        xml = lxml.etree.fromstring(file_contents, parser=parser)

        try:
            self.title = xml.attrib['titlenum'].lstrip('0')
        except KeyError:
            # Without a title number there is nothing to file rows under.
            return

        # Title heading (optional).  ``.text`` is None when the element has
        # no direct text, and re.match() would raise TypeError on None.
        headings = xml.xpath('//hdsupnest')
        title_section = headings[0].text if headings else None
        if title_section is not None:
            match = re.match(
                r"TITLE (?P<title>\w+)\s*(?:-|&mdash;)\s*(?P<name>\w+)",
                title_section)
            if match:
                Law.objects.get_or_create(
                    title=match.group('title').lstrip('0'),
                    section="",
                    psection="",
                    defaults={
                        'text': title_section,
                        'order': self.ordering,
                        'level': 0,
                    })
                self.ordering += 1

        sections = xml.xpath('//section')
        if not sections:
            return
        self.section = sections[0].attrib['num']

        # Reuse the existing section-level row if there is one, so that a
        # row created earlier for this title/section is filled in rather
        # than duplicated.
        matches = Law.objects.filter(
            title=self.title,
            section=self.section,
            psection="")
        if matches:
            law = matches[0]
        else:
            law = Law(title=self.title,
                      section=self.section,
                      psection="")
        law.order = self.ordering
        body = self.xslt(xml).xpath(
            '//xhtml:body/xhtml:div',
            namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})[0]
        law.text = unicode(lxml.etree.tostring(body))
        law.source = source
        law.set_name()
        law.save()

        # Text that sits directly under the section, outside any subsection.
        for sect_text in xml.xpath('//section/sectioncontent/text'):
            self.ordering += 1
            l2 = Law(
                title=self.title,
                section=self.section,
                psection="",
                order=self.ordering,
                text=unicode(sect_text.xpath('string()')),
                source=source)
            l2.set_name()
            l2.save()

        # Top-level subsections; parse_psection recurses into nested ones.
        for psection in xml.xpath('//section/sectioncontent/psection'):
            self.parse_psection(psection, [], source)
Пример #2
0
    def parse_psection(self, psection, parts, source):
        """Store one ``<psection>`` element, and its nested psections, as rows.

        For every ``<text>`` or ``<head>`` child a ``Law`` row is saved under
        ``self.title`` / ``self.section`` with the psection's ``id`` attribute
        as ``psection``; nested ``<psection>`` children are handled
        recursively.  ``self.ordering`` is incremented once per saved row.

        Cross references found in ``text/aref/subref`` elements are resolved
        to ``Law`` rows (a row with ``order=0`` is created when none exists)
        and assigned as ``references`` of the first row found for this
        psection.

        :param psection: the ``<psection>`` element to process.
        :param parts: list of the ``enum`` strings of the enclosing
            psections.  This element's ``enum`` is appended for the duration
            of the call and popped again before returning.
        :param source: value stored in the ``source`` field of saved rows.
        """
        parts.append(psection.xpath('string(enum)'))
        psection_id = psection.attrib['id']

        # Get references
        ref_laws = []
        for ref in psection.xpath('text/aref'):
            for subref in ref.xpath('subref'):
                if subref.attrib['type'] == 'title':
                    # Capture the whole title token up to the next "_".
                    # The previous pattern "[^_]" took only one character,
                    # so a multi-digit title lost everything after its
                    # first digit.
                    # NOTE(review): assumes targets have the form
                    # "usc_sup_01_<title>[_...]" -- confirm against data.
                    match = re.match(r"usc_sup_01_([^_]+)",
                                     subref.attrib['target'])
                    if match:
                        (title,) = match.groups()
                        title = title.lstrip('0')
                        section = ""
                        ref_psec_id = ""
                    else:
                        continue

                elif subref.attrib['type'] in ['sec', 'psec']:
                    match = re.match(
                        r"usc_sec_(?P<title>\d+)_(?P<section>[^-]+)-*(?P<section2>[0-9A-Za-z]*)-?(?:\#(?P<psection>\w+))?",
                        subref.attrib['target'])
                    if not match:
                        continue
                    (title, sec1, sec2, ref_psec_id) = match.groups()
                    title = title.lstrip('0')
                    # Zero padding is stripped from the outer ends of the
                    # two section components before they are joined.
                    section = sec1.lstrip('0') + sec2.rstrip('0')
                    # The "#psection" fragment is optional in the target.
                    ref_psec_id = ref_psec_id or ""

                else:
                    # Any other reference type is ignored.
                    continue

                matches = Law.objects.filter(
                    title=title,
                    section=section,
                    psection=ref_psec_id)
                if matches:
                    ref_law = matches[0]
                else:
                    # Referenced law not stored yet: create a row with only
                    # its key fields so the reference has a target.
                    ref_law = Law.objects.create(
                        title=title,
                        section=section,
                        psection=ref_psec_id,
                        order=0)
                ref_laws.append(ref_law)

        for sub_element in psection:
            if sub_element.tag in ["text", "head"]:
                self.ordering += 1
                matches = Law.objects.filter(
                    title=self.title,
                    section=self.section,
                    psection=psection_id)
                # Reuse a row only when it is the single match and has no
                # source yet; otherwise start a new row.
                if len(matches) == 1 and not matches[0].source:
                    law = matches[0]
                else:
                    law = Law(
                        title=self.title,
                        section=self.section,
                        psection=psection_id)
                law.level = int(psection.attrib['lev'])
                law.text = unicode(sub_element.xpath('string()') or "")
                law.order = self.ordering
                law.source = source
                law.set_name(parts)
                law.save()
            elif sub_element.tag == "psection":
                self.parse_psection(sub_element, parts, source)

        if ref_laws:
            # References come from <text> children, each of which was saved
            # as a row by the loop above, so a row normally exists here.
            first = Law.objects.filter(title=self.title, section=self.section,
                                       psection=psection_id)[0]
            first.references = ref_laws
        parts.pop()