Example #1
    def handle(self, *args, **options):

        new_filings = 0
        rss_url = "http://efilingapps.fec.gov/rss/generate?preDefinedFilingType=ALL"

        logger.info('SCRAPE_RSS_FILINGS - starting regular run')
        # fetch the RSS feed with an explicit User-Agent; data=None makes this
        # request a plain GET
        headers = {'User-Agent': USER_AGENT}
        data = None
        req = urllib2.Request(rss_url, data, headers)
        response = urllib2.urlopen(req)
        rssdata = response.read()

        #print rssdata
        results = parse_xml_from_text(rssdata)
        for filing_number in results:
            filing_entered = enter_or_skip_filing(filing_number)
            if filing_entered:
                new_filings += 1

        # log the results of this run
        logger.info(
            "SCRAPE_RSS_FILINGS - completing regular run--created %s new filings"
            % new_filings)
        # update the global scrape time (don't do this at the start
        # of the script in case it dies before completion).
        set_update('scrape_rss_filings')
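
This example targets Python 2's urllib2. A minimal, hypothetical sketch of the same fetch-and-count flow on Python 3's urllib.request (with the project's parse_xml_from_text and enter_or_skip_filing helpers passed in as plain callables, since their internals are not shown here) could look like this:

    # A sketch, not the project's actual code: fetch the RSS feed, parse out
    # filing numbers, and count how many are newly entered. parse_xml and
    # enter_filing stand in for parse_xml_from_text / enter_or_skip_filing.
    import urllib.request

    def fetch_rss_filings(rss_url, user_agent, parse_xml, enter_filing):
        req = urllib.request.Request(rss_url, data=None,
                                     headers={'User-Agent': user_agent})
        with urllib.request.urlopen(req) as response:
            rssdata = response.read()

        new_filings = 0
        for filing_number in parse_xml(rssdata):
            if enter_filing(filing_number):
                new_filings += 1
        return new_filings

Passing the helpers in as arguments keeps the sketch runnable in isolation and easy to exercise with stubs.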
Example #2
    def handle(self, *args, **options):

        logger.info('FIND_NEW_FILINGS - starting regular run')

        highest_filing_number = Filing.objects.all().order_by(
            '-filing_number')[0].filing_number
        logger.info("highest previously available filing number: %s" %
                    (highest_filing_number))
        trial_file_number = highest_filing_number
        highest_available_file_number = highest_filing_number
        file_misses = 0
        file_miss_threshold = 3
        new_files = 0

        # probe successive filing numbers until enough of them come back empty
        while True:
            trial_file_number += 1
            location = FEC_DOWNLOAD % (trial_file_number)
            try:
                # urlopen raises HTTPError if no filing exists at this number yet
                result = urllib2.urlopen(location)
                logger.info("FIND_NEW_FILINGS: found new filing %s" %
                            (location))
                now = timezone.now()
                obj, created = Filing.objects.get_or_create(
                    filing_id=trial_file_number,
                    filing_number=trial_file_number,
                    filing_type="E",
                    defaults={
                        'process_time': now,
                        'discovery_method': 'F'
                    })
                if created:
                    new_files += 1

            except urllib2.HTTPError:
                logger.info("FIND_NEW_FILINGS: filing unavailable at %s" %
                            (location))
                file_misses += 1

            # stop once the number of missed filings reaches the threshold
            if file_misses >= file_miss_threshold:
                break

            sleep(1)

        # log the results of this run (outside the loop, once probing stops)
        logger.info(
            "FIND_NEW_FILINGS - completing regular run--created %s new filings"
            % new_files)

        # set the update time.
        set_update('incremental_find_filings')
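
The discovery loop above relies on urllib2.urlopen raising HTTPError when the FEC endpoint has no document for a trial filing number. A minimal sketch of that probe in isolation, assuming Python 3's urllib and using url_template as a stand-in for the project's FEC_DOWNLOAD format string:

    # A sketch under the assumption that a missing filing answers with an
    # HTTP error (e.g. 404); url_template plays the role of FEC_DOWNLOAD.
    import urllib.error
    import urllib.request

    def filing_exists(filing_number, url_template):
        location = url_template % filing_number
        try:
            with urllib.request.urlopen(location) as response:
                return response.status == 200
        except urllib.error.HTTPError:
            return False

Example #2 then stops probing once three such misses have accumulated (file_miss_threshold).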