Example #1
 def scrape(self):
     acts, datemodified = self.getActs()
     lastroot = self.getRoot()
     if datemodified != lastroot.released:
         root = self.makeRoot(datemodified=datemodified)
     else:
         root = lastroot
     for act in acts:
         self.processAct(act, root)
     root.analyze()
     store.commit()
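One note on the comparison above: != tests value equality, while "is not" tests object identity, which is never the right way to compare release dates. A minimal illustration with toy values (not from the scraper):

 from datetime import date

 d1 = date(2013, 7, 1)
 d2 = date(2013, 7, 1)
 print(d1 == d2)   # True: equal values
 print(d1 is d2)   # False: two distinct objects, identity is not equality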
Example #2
 def processFile(self, fn, root):
     log.debug('Processing file %s...' % fn)
     self.progress(label=fn)
     with open(fn, 'r') as f:
         soup = BS(f.read())
     if fn.lower().endswith('a.xml'):
         # this looks like an appendix file; any appendix whose identifier is not
         # of the form /us/usc/tN[aA] will NOT be processed
         appendices = soup.select('appendix[identifier$=a]')
         appendices += soup.select('appendix[identifier$=A]')
         self.progress(i=len(appendices))
         for appendix in appendices:
             log.debug('Processing appendix: %s@%s' % (appendix['identifier'], self.rp))
             self.processAppendix(appendix, root)
     else:
         self.processTitle(soup.title, root)
     log.debug('Finished processing file %s...' % fn)
     store.commit()
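The appendix lookup leans on BeautifulSoup's CSS attribute selectors: [identifier$=a] matches tags whose identifier attribute ends in 'a', and the match is case-sensitive, hence the separate query for 'A'. A self-contained sketch with made-up identifiers:

 from bs4 import BeautifulSoup

 xml = ('<uscdoc>'
        '<appendix identifier="/us/usc/t5a"></appendix>'
        '<appendix identifier="/us/usc/t11"></appendix>'
        '</uscdoc>')
 soup = BeautifulSoup(xml, 'html.parser')
 hits = soup.select('appendix[identifier$=a]') + soup.select('appendix[identifier$=A]')
 print([h['identifier'] for h in hits])   # ['/us/usc/t5a']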
Example #3
 def scrape(self):
     if not self.rp:
         r = Cache.get(self.nation.cfg['entrypoint'])
         soup = BS(str(r))
         log.info("No release point specified, retrieving latest...")
         # scraping the release point off the page's 'releasepointinformation'
         # heading failed fantastically, so derive it from the zip url's filename
         # find the download url
         self.zipurl = self.nation.cfg['entrypoint'].rpartition('/')[0] + '/' + soup.findAll('a', title='All USC Titles in XML')[0]['href']
         self.rp = utf8(self.zipurl.rpartition('@')[-1].partition('.')[0])
         log.info("Found release point %s" % self.rp)
     else:
         log.info('Using specified release point %s...' % self.rp)
         self.zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip' % (tuple(self.rp.split('-')) + (self.rp,))
     
     log.debug("Using zipurl: %s" % self.zipurl)
     
     class FileNotThere(Exception): pass
     class XMLNotThere(Exception): pass
     class AllGood(Exception): pass
     
     filename = self.zipurl.rpartition('/')[-1]
     xmldir = self._workdir + os.sep + 'xml' + os.sep
     
     # check to see if we have xml that works
     # if we don't, check to see if we have a zip file
     # if we don't, download it
     # if we do, extract it
     # check the xml again; if it's good, proceed
     # if it's not, error out
     
     try:
         assert os.path.exists(xmldir + 'usc01.xml')
         with open(xmldir + 'usc01.xml', 'r') as f:
             soup = BS(f.read())
         xmlrp = soup.find('docpublicationname').text.split('@')[-1]
         if xmlrp == self.rp:
             raise AllGood
         else:
             raise XMLNotThere
     except (XMLNotThere, AssertionError):
         # delete directory if it exists
         if os.path.exists(xmldir):
             shutil.rmtree(xmldir)
         # if there's no xml file, download it
         if not os.path.exists(self._workdir + os.sep + filename):
             log.info('No zipfile found for release point, downloading...')
             self.downloadFile(self.zipurl, filename)
         # now we should have a zipfile and no existing xmldir
         log.info('Extracting file %s...' % filename)
         zf = ZipFile(self._workdir + os.sep + filename, 'r')
         # older release points do not have an interior xml/ dir
         if not all(n.startswith('xml/') for n in zf.namelist()):
             zf.extractall(xmldir)
         else:
             zf.extractall(self._workdir)
         # double check the xml now...
         assert os.path.exists(xmldir + 'usc01.xml')
         # it may be problematic to rely on the RP information in the XML documents provided
         # rp 113-21 (the earliest presently available) does not include this in the 
         # docpublicationname meta tag
         #soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
         #xmlrp = soup.find('docpublicationname').text.split('@')[-1]
         #if xmlrp != self.rp:
         #    raise XMLNotThere('XML did not check out after extraction.')
     except AllGood:
         pass
     
     log.info('All checks passed...')
     root = self.findOrCreateRoot()
     xf = sorted(xmldir + f for f in os.listdir(xmldir) if f.endswith('.xml'))
     log.info("Processing %i files..." % len(xf))
     self.bar = progress.Bar(label='US', expected_size=1000*len(xf))
     self.progress( i=len(xf) )
     for fn in xf:
         self.processFile(fn, root)
         self.progress(rollup=1000)
     log.info('Analyzing code...')
     self.progress(label="Analyzing")
     root.analyze(commit=True, bar=self.bar)
     store.commit()
     log.info('Scrape completed.')
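For reference, the zipurl arithmetic in the else branch round-trips with the filename parsing in the if branch. A worked example using 113-21, the earliest release point mentioned in the comments:

 rp = '113-21'
 zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip' % (tuple(rp.split('-')) + (rp,))
 # http://uscode.house.gov/download/releasepoints/us/pl/113/21/xml_uscAll@113-21.zip
 filename = zipurl.rpartition('/')[-1]                   # 'xml_uscAll@113-21.zip'
 print(filename.rpartition('@')[-1].partition('.')[0])   # '113-21'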
Example #4
 def processAct(self, actcid, parent):
     log.info('Processing act: %s' % actcid)
     soup = self.getActSoup(actcid)
     act = self.findOrCreateAct(parent.released, actcid, parent.rev)
     act.parent = parent
     act.cid = actcid
     act.released = parent.released
     act.rev = parent.rev
     act.depth = 1
     act.pre = Text.make(soup.title.text)
     act.text = Text.make(soup.select("section.intro")[0].text)
     act.meta = utf8(soup.select("p#assentedDate")[0].text.rpartition('.')[0])
     doc = soup.select("div.docContents div")[0]
     # the CSS prefix selector is much easier than filtering ids by hand
     id_prefix = 'h-'
     sections = [i['id'] for i in doc.select('[id^={}]'.format(id_prefix))]
     def classAndTag(o):
         return isinstance(o, Tag) and o.has_attr('class')
     
     if sections:
         for secid in progress.bar(sections, label=act.cid):
             sec = self.findOrCreateSection(act.released, secid, act)
             soup = doc.select("[id=%s]" % secid)[0]
             sec.pre = Text.make(soup.text)
             sec.cid = secid
             sec.depth = 2
             sec.parent = act
             sec.released = act.released
             sec.rev = act.rev
             stop = False
             sib = soup.nextSibling
             content = ""
             for t in soup.select(".wb-invisible"):
                 t.clear()
             while not stop:
                 if classAndTag(sib):
                     if sib.has_attr('id') and sib['id'].startswith('h-'):
                         stop = True
                     elif sib.name == 'section':
                         stop = True
                     elif any( c in ['Definition', 
                                     'Section',
                                     'MarginalNote', 
                                     'ProvisionList', 
                                     'Part', 
                                     'Subheading', 
                                     'MarginalNoteDefinedTerm',
                                     'ContinuedSectionSubsection',
                                     'Oath'] for c in sib['class']):
                         content += sib.text
                     elif sib['class'][0].startswith('indent'):
                         content += sib.text
                     elif sib['class'][0] == 'HistoricalNote':
                         sec.meta = utf8(sib.text)
                     elif sib['class'][0] in ['PITLink',
                                              'nif']:
                         pass
                     else:
                         log.info('Unhandled case in parsing section %s/%s' % (act.cid, secid))
                         log.debug(sib.name)
                         log.debug(sib.attrs)
                 if not sib or not sib.nextSibling:
                     stop = True
                 if not stop:
                     sib = sib.nextSibling
             sec.text = Text.make(content)
             sec.stored = now()
             schedules = soup.select('div [class=Schedule]')
             post = ''
             for sched in schedules:
                 post += sched.text
             act.post = Text.make(post)
             act.stored = now()
     else:
         # alternative section method: switch to the XML version and pull
         # identifying information out of the 'code' attribute; anecdotally,
         # this seems to be needed for very small acts
         log.info('Switching to alternate section method')
         soup = self.getActXMLSoup(act.cid)
         sections = soup.select("section[code^=se=]")
         for section in sections:
             secid = section.get('code', '').replace('=', '-').replace('"', '')
             try:
                 pre = ''
                 if section.label:
                     pre = section.label.text + ' '
                 if section.marginalnote:
                     pre += section.marginalnote.text
                 text = section.select_one('text').text
             except Exception:
                 # secid is resolved before the try so the log line itself can't fail
                 log.warn('ERROR in alternate parsing method for {}.{}'.format(act.cid, secid))
                 raise
             # skip sections marked repealed
             if 'repealed' not in text.lower():
                 sec = self.findOrCreateSection(act.released, secid, act)
                 sec.setPre(pre)
                 sec.setText(text)
                 sec.parent = act
                 sec.depth = 2
                 sec.released = act.released
                 sec.rev = act.rev
                 sec.cid = secid
     act.analyze()
     store.commit()
     return act
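The heart of the first branch is the sibling walk: start at a section's anchor, accumulate the text of class-bearing siblings, and stop at the next h- id. A stripped-down, runnable sketch with invented markup (the class names are borrowed from the snippet; the structure is an assumption):

 from bs4 import BeautifulSoup, Tag

 html = ('<div>'
         '<h2 id="h-1">Section 1</h2>'
         '<p class="MarginalNote">Definitions</p>'
         '<p class="ProvisionList">(a) a defined term</p>'
         '<h2 id="h-2">Section 2</h2>'
         '</div>')
 doc = BeautifulSoup(html, 'html.parser')
 sib = doc.select('[id=h-1]')[0].nextSibling
 content = ''
 while sib is not None:
     # stop at the next section anchor, as the h- check does above
     if isinstance(sib, Tag) and sib.has_attr('id') and sib['id'].startswith('h-'):
         break
     if isinstance(sib, Tag) and sib.has_attr('class'):
         content += sib.text
     sib = sib.nextSibling
 print(content)   # 'Definitions(a) a defined term'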