def processTitle(self, title, parent):
    """Record one title as a depth-1 code and process each of its chapters.

    Args:
        title: parsed title element; its ``identifier`` attribute, ``num``
            and ``heading`` children are read.
        parent: object stored as the parent of the title's code.
    """
    title_code = self.findOrCreateCode(title['identifier'])
    title_code.parent = parent
    title_code.depth = 1

    # The displayed prefix is the title number immediately followed by its heading.
    heading_text = title.num.text + title.heading.text
    title_code.pre = Text.make(heading_text)
    self.progress(label=title_code.cid)

    # Only chapters that carry an identifier attribute are processed.
    chapters = title.select("chapter[identifier]")
    chapter_count = len(chapters)
    log.debug('Processing %i chapters for title %s' % (chapter_count, title_code.pre.text))
    self.progress(i=chapter_count)

    # NOTE(review): the source formatting was collapsed onto one line; the
    # bare progress() call is assumed to run once per chapter -- confirm.
    for chapter_el in chapters:
        self.processChapter(chapter_el, title_code)
        self.progress()
def processAppendix(self, appendix, parent):
    """Store a title's appendix as a depth-2 pseudo-chapter and process its sections.

    The owning title is looked up by dropping the last character of the
    appendix identifier (assumed to be a trailing 'a'). The appendix is
    stored under the cid ``<title cid>/appendix``.

    Args:
        appendix: parsed appendix element; its ``identifier`` attribute,
            ``heading`` child and ``section`` / ``courtRule`` descendants
            are read.
        parent: accepted for signature parity with the other process*
            methods; not used here.
    """
    # assume the title can be identified by removing the 'a' from the
    # appendix identifier
    title = self.findOrCreateCode(appendix['identifier'][:-1])
    log.debug('Processing appendix for %s' % title.cid)

    # Attribute access on a BeautifulSoup tag (appendix.cid) looks for a child
    # <cid> element rather than an identifier, so label with the element's
    # own identifier attribute instead.
    self.progress(label=appendix['identifier'])

    chapter_cid = title.cid + '/appendix'
    chapter = self.findOrCreateCode(chapter_cid)
    chapter.parent = title
    chapter.depth = 2
    chapter.pre = Text.make(appendix.heading.text)

    # All sections first, then all court rules (same order as before).
    sections = appendix.find_all("section") + appendix.find_all("courtRule")
    if sections:
        self.progress(i=len(sections))
    for section in sections:
        self.processSection(section, chapter)
def processChapter(self, chapter, parent):
    """Record one chapter as a depth-2 code and process each of its sections.

    Chapters whose ``status`` attribute is 'omitted' or 'repealed' are
    skipped entirely.

    Args:
        chapter: parsed chapter element; its ``identifier`` attribute,
            optional ``status`` attribute, ``num`` and ``heading`` children
            are read.
        parent: object stored as the parent of the chapter's code.
    """
    # exclude omitted or repealed chapters
    if chapter.has_attr('status') and chapter['status'] in ('omitted', 'repealed'):
        return

    chapter_code = self.findOrCreateCode(chapter['identifier'])
    chapter_code.depth = 2
    chapter_code.parent = parent

    # The displayed prefix is the chapter number immediately followed by its heading.
    heading_text = chapter.num.text + chapter.heading.text
    chapter_code.pre = Text.make(heading_text)
    self.progress(label=chapter_code.cid)

    # Only sections that carry an identifier attribute are processed.
    sections = chapter.select("section[identifier]")
    section_count = len(sections)
    self.progress(i=section_count)
    if section_count:
        log.debug('Processing chapter %s with %i sections:' % (chapter_code.cid, section_count))

    # NOTE(review): the source formatting was collapsed onto one line; the
    # bare progress() call is assumed to run once per section -- confirm.
    for section_el in sections:
        self.processSection(section_el, chapter_code)
        self.progress()
def processAct(self, actcid, parent):
    """Build or refresh the stored record for one act and its sections.

    The act page is fetched with ``getActSoup``. Sections are located by
    elements whose id starts with 'h-'; each section's text is gathered from
    the siblings that follow its heading. When no such ids exist, the XML
    version of the act (``getActXMLSoup``) is parsed instead.

    Args:
        actcid: cid of the act to process.
        parent: object whose ``released`` and ``rev`` values are copied onto
            the act and which is stored as the act's parent.

    Returns:
        The act object, after ``act.analyze()`` and ``store.commit()``.

    Raises:
        Exception: any error raised while parsing a section in the alternate
            (XML) method is logged and re-raised.
    """
    log.info('Processing act: %s' % actcid)
    soup = self.getActSoup(actcid)
    act = self.findOrCreateAct(parent.released, actcid, parent.rev)
    act.parent = parent
    act.cid = actcid
    act.released = parent.released
    act.rev = parent.rev
    act.depth = 1
    act.pre = Text.make(soup.title.text)
    act.text = Text.make(soup.select("section.intro")[0].text)
    # Keep only the part of the assent paragraph before its final '.'.
    act.meta = utf8(soup.select("p#assentedDate")[0].text.rpartition('.')[0])
    doc = soup.select("div.docContents div")[0]

    # Section headings are the elements whose id starts with this prefix.
    id_prefix = 'h-'
    sections = [i['id'] for i in doc.select('[id^={}]'.format(id_prefix))]

    # Classes whose text belongs to the body of the current section.
    content_classes = (
        'Definition', 'Section', 'MarginalNote', 'ProvisionList', 'Part',
        'Subheading', 'MarginalNoteDefinedTerm',
        'ContinuedSectionSubsection', 'Oath',
    )
    # Classes that are deliberately skipped.
    ignored_classes = ('PITLink', 'nif')

    if sections:
        for secid in progress.bar(sections, label=act.cid):
            sec = self.findOrCreateSection(act.released, secid, act)
            # Use a separate name for the heading: rebinding `soup` here made
            # the schedule lookup below search the last heading instead of
            # the whole document.
            heading = doc.select("[id=%s]" % secid)[0]
            sec.pre = Text.make(heading.text)
            sec.cid = secid
            sec.depth = 2
            sec.parent = act
            sec.released = act.released
            sec.rev = act.rev

            for hidden in heading.select(".wb-invisible"):
                hidden.clear()

            # Walk the following siblings until the next heading, a <section>
            # element, or the end of the sibling chain. Only tags that carry
            # a class attribute are inspected; everything else is stepped over.
            parts = []
            sib = heading.nextSibling
            while sib is not None:
                if isinstance(sib, Tag) and sib.has_attr('class'):
                    classes = sib['class']
                    # Guard against an empty class list.
                    first_class = classes[0] if classes else ''
                    if sib.has_attr('id') and sib['id'].startswith(id_prefix):
                        break
                    if sib.name == 'section':
                        break
                    if any(c in content_classes for c in classes):
                        parts.append(sib.text)
                    elif first_class.startswith('indent'):
                        parts.append(sib.text)
                    elif first_class == 'HistoricalNote':
                        sec.meta = utf8(sib.text)
                    elif first_class in ignored_classes:
                        pass
                    else:
                        log.info('Unhandled case in parsing section %s/%s' % (act.cid, secid))
                        log.debug(sib.name)
                        log.debug(sib.attrs)
                sib = sib.nextSibling

            sec.text = Text.make(''.join(parts))
            sec.stored = now()

        # Schedule text, searched in the whole act document, becomes the
        # act's trailing text.
        schedules = soup.select('div [class=Schedule]')
        act.post = Text.make(''.join(sched.text for sched in schedules))
        act.stored = now()
    else:
        # alternative section method
        # for this method we switch to the XML version and pull identifying
        # information out of the code= attribute. Anecdotally, this seems to
        # need to be done for very small acts
        log.info('Switching to alternate section method')
        xml_soup = self.getActXMLSoup(act.cid)
        # The attribute value is quoted because it contains '='.
        for section in xml_soup.select('section[code^="se="]'):
            # Reset per section so the error message never reports an unbound
            # or stale id.
            secid = None
            try:
                secid = section['code'].replace('=', '-').replace('"', '')
                label = section.label
                note = section.marginalnote
                pre = label.text + ' ' if label else ''
                if note:
                    pre += note.text
                text = section.select_one('text').text
            except Exception:
                log.warning('ERROR in alternate parsing method for {}.{}'.format(act.cid, secid))
                raise

            # Sections whose text mentions "repealed" are not stored.
            if 'repealed' in text.lower():
                continue

            sec = self.findOrCreateSection(act.released, secid, act)
            sec.setPre(pre)
            sec.setText(text)
            sec.parent = act
            sec.depth = 2
            sec.released = act.released
            sec.rev = act.rev
            sec.cid = secid

    act.analyze()
    store.commit()
    return act