def handle(self, *args, **options):
    new_filings = 0
    rss_url = "http://efilingapps.fec.gov/rss/generate?preDefinedFilingType=ALL"
    logger.info('SCRAPE_RSS_FILINGS - starting regular run')

    # Download the FEC's "all filings" RSS feed, identifying ourselves
    # with our own user agent string.
    headers = {'User-Agent': USER_AGENT}
    req = urllib2.Request(rss_url, None, headers)
    response = urllib2.urlopen(req)
    rssdata = response.read()

    # Pull the filing numbers out of the feed and enter any we haven't
    # already seen.
    results = parse_xml_from_text(rssdata)
    for filing_number in results:
        filing_entered = enter_or_skip_filing(filing_number)
        if filing_entered:
            new_filings += 1

    # Log the results of this run.
    logger.info(
        "SCRAPE_RSS_FILINGS - completing regular run--created %s new filings"
        % new_filings)

    # Update the global scrape time (don't do this at the start
    # of the script in case it dies before completion).
    set_update('scrape_rss_filings')
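# For orientation, a minimal sketch of what parse_xml_from_text might do --
# the real helper is defined elsewhere in this project, and this version is
# an assumption, not its actual implementation. It assumes each RSS <item>
# carries a <link> whose path ends in "<filing_number>.fec"; that URL shape
# is also an assumption about the feed, not confirmed by this file.
#
# import re
# import xml.etree.ElementTree as ET
#
# def parse_xml_from_text_sketch(rssdata):
#     """Return the filing numbers found in the raw RSS text."""
#     filing_numbers = []
#     root = ET.fromstring(rssdata)
#     # RSS 2.0 nests items under channel/item.
#     for item in root.findall('./channel/item'):
#         link = item.findtext('link') or ''
#         match = re.search(r'(\d+)\.fec$', link)
#         if match:
#             filing_numbers.append(int(match.group(1)))
#     return filing_numbers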
def handle(self, *args, **options):
    logger.info('FIND_NEW_FILINGS - starting regular run')

    # Start probing just past the highest filing number we already have.
    highest_filing_number = Filing.objects.order_by(
        '-filing_number')[0].filing_number
    logger.info("highest previously available filing number: %s"
                % highest_filing_number)

    trial_file_number = highest_filing_number
    file_misses = 0
    file_miss_threshold = 3
    new_files = 0

    # Try successive filing numbers until we hit several misses in a row,
    # which we take to mean we've reached the newest available filing.
    while True:
        trial_file_number += 1
        location = FEC_DOWNLOAD % trial_file_number
        try:
            urllib2.urlopen(location)
            logger.info("FIND_NEW_FILINGS: found new filing %s" % location)
            # A hit resets the counter, so only consecutive misses count
            # toward the threshold.
            file_misses = 0
            now = timezone.now()
            obj, created = Filing.objects.get_or_create(
                filing_id=trial_file_number,
                filing_number=trial_file_number,
                filing_type="E",
                defaults={
                    'process_time': now,
                    'discovery_method': 'F'
                })
            if created:
                new_files += 1
        except urllib2.HTTPError:
            logger.info("FIND_NEW_FILINGS: filing unavailable at %s"
                        % location)
            file_misses += 1
            if file_misses >= file_miss_threshold:
                break
        sleep(1)

    logger.info(
        "FIND_NEW_FILINGS - completing regular run--created %s new filings"
        % new_files)

    # Set the update time.
    set_update('incremental_find_filings')
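# The probe above downloads the whole .fec file just to test for a 404. A
# lighter variant -- shown here as a sketch, not the project's actual code --
# issues an HTTP HEAD request so only headers come back. filing_exists and
# HeadRequest are hypothetical names, and the example FEC_DOWNLOAD template
# in the docstring is an assumption; the real value is defined elsewhere.
#
# import urllib2
#
# class HeadRequest(urllib2.Request):
#     # urllib2 picks GET/POST automatically; overriding get_method()
#     # is the standard way to force a HEAD request.
#     def get_method(self):
#         return 'HEAD'
#
# def filing_exists(filing_number, url_template):
#     """Return True if the server answers a HEAD probe without an error.
#
#     url_template is a format string like FEC_DOWNLOAD, e.g. something
#     resembling "http://docquery.fec.gov/dcdev/posted/%s.fec".
#     """
#     try:
#         urllib2.urlopen(HeadRequest(url_template % filing_number))
#         return True
#     except urllib2.HTTPError:
#         return False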