def handle(self, *args, **options):
    log = logging.getLogger()
    log.setLevel(logging.INFO)

    # Create a datastore wrapper object
    ds = self.ds = CouncilmaticDataStoreWrapper()
    source = self.source = PhillyLegistarSiteWrapper(
        settings.LEGISLATION['ROOT'])

    # Seed the PDF cache with already-downloaded content.
    #
    # Downloading and parsing PDF content really slows down the scraping
    # process. If we had to redownload every PDF each time we scraped, it
    # would take a very long time to refresh all of the old content. So
    # that PDFs that have already been downloaded won't be downloaded
    # again, seed the source cache with that data.
    #
    # Hopefully this won't be too much of a burden on memory :).
    source.init_pdf_cache(ds.pdf_mapping)

    update_files = options['update_files']

    try:
        self._get_new_files()
        if update_files:
            self._get_updated_files()
    except TooManyGeocodeRequests:
        sys.exit(0)
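
# A minimal sketch of how the PDF-cache seeding above might work. Neither
# init_pdf_cache nor pdf_mapping is shown in this section, so the dict-based
# cache, the class name, and the helper names here are assumptions for
# illustration only.

class PdfCacheSketch(object):
    def __init__(self):
        self.pdf_cache = {}

    def init_pdf_cache(self, mapping):
        # Seed the cache with {pdf_url: extracted_text} pairs that the
        # datastore already holds, so those URLs are never re-fetched.
        self.pdf_cache.update(mapping)

    def get_pdf_text(self, url):
        # Consult the cache before downloading and parsing the PDF.
        if url not in self.pdf_cache:
            self.pdf_cache[url] = self._download_and_parse(url)
        return self.pdf_cache[url]

    def _download_and_parse(self, url):
        # Placeholder for the slow network and PDF-parsing step.
        raise NotImplementedError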
def test_RecoversGracefullyAfterIntegrityError(self):
    from phillyleg.models import LegFile
    from django.db.utils import DatabaseError

    LegFile.objects.all().delete()
    LegFile.objects.create(title='testing', key=123)

    ds = CouncilmaticDataStoreWrapper()
    try:
        ds._save_or_ignore(LegFile, {'title': 'testing', 'key': 123})
        ds._save_or_ignore(LegFile, {'title': 'testing', 'key': 123})
    except DatabaseError:
        self.fail("Shouldn't have raised a DatabaseError")
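
# A sketch of the behavior that test exercises: _save_or_ignore presumably
# swallows the IntegrityError raised by a duplicate key and leaves the
# existing row untouched. The actual implementation isn't shown in this
# section, so this body, including the atomic-block handling, is an
# assumption for illustration only.

from django.db import IntegrityError, transaction

def _save_or_ignore(self, ModelClass, record):
    instance = ModelClass(**record)
    try:
        # Wrap the save in an atomic block so a failed INSERT doesn't
        # poison the surrounding transaction (required on PostgreSQL).
        with transaction.atomic():
            instance.save()
        return instance, True
    except IntegrityError:
        # The record already exists; ignore it, as the name promises.
        return instance, False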
def _get_new_files(self, force_download):
    # Create a datastore wrapper object
    ds = CouncilmaticDataStoreWrapper()
    source = ScraperWikiSourceWrapper()

    # Get the latest filings
    curr_key = ds.get_latest_key()

    while True:
        curr_key, source_obj = source.check_for_new_content(
            curr_key, force_download)

        if source_obj is None:
            break

        record, attachments, actions, minutes = \
            source.scrape_legis_file(curr_key, source_obj)
        ds.save_legis_file(record, attachments, actions, minutes)
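
# The loop above relies on a sentinel contract from check_for_new_content:
# it returns the next key plus a source object, or a None source object once
# there is nothing newer. A stub source honoring that contract (hypothetical
# names, for illustration only) makes the termination behavior easy to test.

class StubSource(object):
    def __init__(self, contents):
        # contents maps key -> source object for the fake "new" filings.
        self.contents = contents

    def check_for_new_content(self, curr_key, force_download):
        next_key = curr_key + 1
        if next_key in self.contents:
            return next_key, self.contents[next_key]
        # No newer filing: hand back the unchanged key and a None sentinel,
        # which is what breaks the scraping loop above.
        return curr_key, None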