Example #1
    def parse_list(self, page):
        # Parse a single JSON list page into photo record dicts.
        for photo in simplejson.loads(page)['photos']['photo']:
            if self.records_seen > self.max_photos_per_scrape:
                raise StopScraping("We've reached %d records" %
                                   self.max_photos_per_scrape)
            self.records_seen += 1
            yield photo
Example #2
    def clean_detail_record(self, record):
        record['expiration_date'] = parse_date(record['expiration_date'],
                                               '%m/%d/%Y')
        record['issue_date'] = parse_date(record['issue_date'], '%m/%d/%Y')

        # The PDF text is in the ISO-8859-1 encoding. Convert it here so that
        # we don't get an encoding error when we save it to the database.
        record['text'] = record['text'].decode('iso-8859-1')
        record['text'] = record['text'].replace(
            'Display This Permit While Work Is In Progress', '')
        record['text'] = record['text'].strip()

        # Remove the "ISSUED TO" section, as it contains the name of the person
        # who owns the property, and we have a policy of not displaying names.
        # Note that we include a sanity check that the "ISSUED TO" section
        # doesn't contain more than 9 newlines, as that would signify a broken
        # regular expression.
        m = issued_re.search(record['text'])
        if m and m.group(0).count('\n') < 10:
            record['text'] = issued_re.sub('', record['text'])

        if record['second_line_of_district'].strip():
            record['historic_district'] += record['second_line_of_district']

        if record['district_or_landmark'] == 'HISTORIC DISTRICT':
            record['landmark'] = 'N/A'
        else:
            record['landmark'] = record['historic_district']
            record['historic_district'] = 'N/A'

        # Check for a duplicate record. Because the scraper works in
        # reverse-chronological order, we can safely raise StopScraping if we
        # reach a duplicate.
        try:
            qs = NewsItem.objects.filter(schema__id=self.schema.id)
            qs = qs.by_attribute(self.schema_fields['docket'],
                                 record['docket'])
            qs = qs.by_attribute(self.schema_fields['cofa'], record['cofa'])
            old_record = qs[0]
        except IndexError:
            pass
        else:
            raise StopScraping('Found a duplicate record %s' % old_record.id)

        return record
Example #3
    def list_pages(self):
        """generate page strings."""

        # XXX argh we apparently need the api_secret, and thus the token / frob dance?
        # even though this method doesn't need authentication???
        flickr = flickrapi.FlickrAPI(self.api_key, self.api_secret)
        extent = ','.join(
            [str(coord) for coord in get_default_bounds().extent])

        # Result of each iteration is a JSON string.
        pagenum = 0
        pages = float('inf')
        while pagenum < pages:
            pagenum += 1
            page = flickr.photos_search(
                has_geo=1,
                bbox=extent,
                safe_search='1',
                min_taken_date=self.min_timestamp,
                max_taken_date=self.max_timestamp,
                per_page='400',
                page=str(pagenum),
                extras='date_taken,date_upload,url_sq,description,'
                       'geo,owner_name',
                format='json',
                content_type='1',  # photos only.
                nojsoncallback='1',
            )

            # Ugh, we need to find out how many pages there are, so we parse here
            # and also in parse_list().
            adict = simplejson.loads(page)
            try:
                pages = int(adict['photos']['pages'])
            except KeyError:
                if adict.get('stat') == 'fail':
                    self.logger.error("Flickr error code %r: %s" %
                                      (adict['code'], adict['message']))
                else:
                    self.logger.error("Page content:\n%s" % page)
                raise StopScraping(
                    "Parsing error, missing 'photos' or 'pages', see above.")
            yield page
Example #4
    def handle_ratelimit_exceeded(self, url, reset_time=None):
        """
        Either sleep until the rate limit expires and retry the URL,
        or raise StopScraping, depending on options.
        """
        import time
        if reset_time is None:
            reset_time = time.time() + 3600
        msg = ("Hit rate limit. Resets at %s" %
               datetime.datetime.fromtimestamp(reset_time).ctime())
        sleep_time = reset_time - time.time()
        self.logger.info(msg)
        if self.options.wait_for_rate_limit:
            self.logger.info("Sleeping %.2f seconds" % sleep_time)
            time.sleep(sleep_time)
            self.logger.info("Wait limit should be expired, resuming")
        else:
            raise StopScraping(msg)
        page, headers = self.retriever.fetch_data_and_headers(
            url, raise_on_error=False)
        return page, headers
Example #5
    def list_pages(self):
        """generate page ... well, not strings, but decoded JSON structures."""
        # TODO: This fetches a ton of data, which is maybe useful for
        # bootstrapping but very inefficient for getting updates.
        # For that we should support meetup's streaming API,
        # which allows passing a start time.

        # Result of each iteration is a JSON structure.
        # Normally in list_detail scrapers we return a string,
        # and leave parsing to parse_list(); but here we need to
        # parse to figure out pagination.
        from ebpub.db.models import Location
        metro = get_metro()
        city, state = metro['city_name'], metro['state']
        # We rotate among zip codes, fetching one page at a time for
        # each, since it's possible/likely that we will hit a rate
        # limit; this way, all the zip codes should get *something*.
        zipcode_state = {}
        ratelimit_remaining = 99999
        while True:
            for zipcode in Location.objects.filter(
                    location_type__slug='zipcodes'):
                zipcode = zipcode.slug
                zipcode_state.setdefault(zipcode, {
                    'page': int(self.options.start_page),
                    'done': False
                })
                if zipcode_state[zipcode]['done']:
                    continue
                try:
                    int(zipcode)
                except ValueError:
                    # meetup will barf on these.
                    self.logger.info(
                        "Skipping %s, doesn't look like a valid US zip code" %
                        zipcode)
                    continue

                params = dict(
                    zip=zipcode,
                    key=api_key,
                    city=city,
                    state=state,
                    country='US',
                    time='-1m,2m',
                )
                pagenum = zipcode_state[zipcode]['page']
                self.logger.info("Page %s for zip code %s" %
                                 (pagenum, zipcode))
                params['offset'] = pagenum
                url = 'https://api.meetup.com/2/open_events?key=%(key)s&state=%(state)s&city=%(city)s&country=%(country)s&zip=%(zip)s&page=200&offset=%(offset)s' % params
                page, headers = self.retriever.fetch_data_and_headers(
                    url, raise_on_error=False)
                ratelimit_remaining = int(
                    headers.get('x-ratelimit-remaining', '9999'))
                while ratelimit_remaining <= 1:
                    # Apparently meetup says you have 1 hit remaining
                    # when they actually mean "this is the last one."
                    # This either raises an exception, or eventually returns new data.
                    ratelimit_reset = int(headers.get('x-ratelimit-reset', 0))
                    page, headers = self.handle_ratelimit_exceeded(
                        url, ratelimit_reset)
                    ratelimit_remaining = int(
                        headers.get('x-ratelimit-remaining', 0))
                    break
                while int(headers.get('status')) >= 400:
                    try:
                        body = simplejson.loads(page)
                        problem, code = body.get('problem'), body.get('code')
                    except simplejson.JSONDecodeError:
                        problem = page
                        code = ''
                    if code == 'limit':
                        # This either raises an exception, or eventually returns new data.
                        page, headers = self.handle_ratelimit_exceeded(url)
                        break
                    else:
                        msg = "Error %s. %s: %s" % (headers.get('status'),
                                                    code, problem)
                        self.logger.error(msg)
                        raise StopScraping(msg)
                zipcode_state[zipcode]['page'] += 1
                # Parse.
                encoding = headers.get('content-type',
                                       '').split('charset=')[-1]
                try:
                    decoded = page.decode(encoding)
                except LookupError:
                    decoded = UnicodeDammit(page, smartQuotesTo='html').unicode
                parsed = simplejson.loads(decoded)
                # Are there more pages?
                if not parsed['meta'].get('next'):
                    zipcode_state[zipcode]['done'] = True
                    self.logger.info("Finished zip code %s" % zipcode)
                yield parsed

            if all(value['done'] for value in zipcode_state.values()):
                self.logger.info("Finished all zip codes")
                break
Example #6
    def list_pages(self):
        result = self.fetch_data(self.url)
        if self.retriever.cache_hit:
            self.logger.info("HTTP cache hit, nothing new to do")
            raise StopScraping()
        yield result
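
All six examples raise StopScraping from inside a scraper callback (list_pages(), parse_list(), clean_detail_record(), or handle_ratelimit_exceeded()) to tell the driving loop to stop cleanly. That driver side is not shown above, so the following is only a minimal sketch of how such a loop might catch the exception; the StopScraping definition and the run() and save() names are illustrative assumptions, not taken from the examples.

    import logging


    class StopScraping(Exception):
        """Raised by scraper callbacks to end the current run cleanly."""


    def run(scraper):
        # Hypothetical driver loop: walk the list pages, parse each one into
        # records, and stop as soon as any callback raises StopScraping.
        try:
            for page in scraper.list_pages():
                for record in scraper.parse_list(page):
                    record = scraper.clean_detail_record(record)
                    scraper.save(record)  # assumed persistence hook
        except StopScraping as e:
            logging.getLogger(__name__).info("Scraping stopped: %s", e)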