Пример #1
0
    def process_item(self, item, spider):
        """Flatten a scraped item into a flat row and pass it to the exporter.

        Works on a copy of ``item``: formats ``start_time`` / ``end_time`` as
        ``'%Y-%m-%d %H:%M'`` strings, lifts the location fields and the first
        source's ``url`` / ``note`` to top-level keys, adds the agency name
        and the scrape timestamp, then keeps only the keys listed in
        ``self.exporter.fields_to_export`` (each value passed through
        ``self._format_values``).

        Args:
            item: The scraped item; must support ``copy()``, ``get()``,
                ``items()`` and key access.
            spider: The spider that produced the item; ``spider.agency_name``
                is read.

        Returns:
            The flattened dict that was handed to
            ``self.exporter.export_item``.
        """
        time_format = '%Y-%m-%d %H:%M'
        new_item = item.copy()

        # Format the event times. This is best effort: a missing key or a
        # value that is not a date/datetime (e.g. None or an already
        # formatted string) is left untouched rather than failing the item.
        for time_key in ('start_time', 'end_time'):
            try:
                new_item[time_key] = datetime.datetime.strftime(
                    new_item[time_key], time_format)
            except (KeyError, TypeError, ValueError):
                pass

        # flatten location
        new_item['location_url'] = get_key(new_item, 'location.url')
        new_item['location_name'] = get_key(new_item, 'location.name')
        new_item['location_address'] = get_key(new_item, 'location.address')

        # Only the first source is exported. `or [{}]` covers a missing key
        # as well as an empty list or None, which would otherwise raise on
        # the `[0]` lookup.
        sources = new_item.get('sources') or [{}]
        first_source = sources[0]
        new_item['source_url'] = first_source.get('url', '')
        new_item['source_note'] = first_source.get('note', '')

        new_item['agency_name'] = spider.agency_name

        # self.stamp is stored as 'YYYYmmdd_HHMM'; re-render it in the same
        # format used for the event times.
        scraped_at = datetime.datetime.strptime(self.stamp, '%Y%m%d_%H%M')
        new_item['scraped_time'] = scraped_at.strftime(time_format)

        # Drop every key the exporter is not configured to write.
        export_fields = self.exporter.fields_to_export
        new_item = {
            k: self._format_values(k, v)
            for k, v in new_item.items() if k in export_fields
        }

        self.exporter.export_item(new_item)
        return new_item
Пример #2
0
    def process_item(self, item, spider):
        """Flatten an upcoming event and pass it to ``self.save_item``.

        Items without a ``start_time`` and items whose ``start_time`` is
        already in the past are logged at debug level and returned unchanged
        without being saved.

        Args:
            item: The scraped item; read via ``get()``, ``copy()``, ``items()``
                and key access (``'start_time'``, ``'id'``).
            spider: The spider that produced the item; ``spider.logger`` and
                ``spider.long_name`` are used.

        Returns:
            The original, unmodified ``item`` when it is skipped or saved
            successfully.

        Raises:
            DropItem: If ``self.save_item`` raises ``HTTPError``.
        """
        # copy item; airtable-specific munging is happening here that breaks
        # opencivicdata standard

        # Guard: events without a start time are skipped.
        if item.get('start_time') is None:
            spider.logger.debug(
                'AIRTABLE PIPELINE: Ignoring event without start_time {0}'.
                format(item['id']))
            return item

        # Guard: skip events that already started. "Now" is built with the
        # event's own tzinfo so the comparison does not mix naive and aware
        # datetimes.
        dt = item['start_time']
        if dt < datetime.datetime.now(dt.tzinfo):
            spider.logger.debug(
                'AIRTABLE PIPELINE: Ignoring past event {0}'.format(
                    item['id']))
            return item

        # Blocks for a random whole number of seconds between 0 and 3
        # (randint is inclusive on both ends) before each save.
        time.sleep(randint(0, 3))  # to avoid rate limiting?

        # All changes below are made on a copy; the caller's item is
        # returned untouched.
        new_item = item.copy()

        # flatten location
        new_item['location_url'] = get_key(new_item, 'location.url')
        new_item['location_name'] = get_key(new_item, 'location.name')
        new_item['location_address'] = get_key(new_item, 'location.address')
        new_item['location_latitude'] = get_key(
            new_item, 'location.coordinates.latitude')
        new_item['location_longitude'] = get_key(
            new_item, 'location.coordinates.longitude')
        new_item['agency_name'] = spider.long_name
        # Only the first source's URL is kept; a missing 'sources' key
        # yields ''.
        # NOTE(review): an empty 'sources' list would raise IndexError on the
        # [0] lookup - confirm upstream always provides at least one source.
        new_item['url'] = new_item.get('sources', [{
            'url': ''
        }])[0].get('url', '')

        # Keep only the keys named in KEEP_FIELDS (defined outside this
        # block), passing each value through self._format_values.
        new_item = {
            k: self._format_values(k, v)
            for k, v in new_item.items() if k in KEEP_FIELDS
        }

        try:
            self.save_item(new_item, spider)
            return item
        except HTTPError as e:
            # Log the response body and the payload that was sent, then drop
            # the item.
            spider.logger.error('HTTP error')
            spider.logger.error(e.response.content)
            spider.logger.exception('Original message')
            spider.logger.error(json.dumps(new_item, indent=4, sort_keys=True))
            raise DropItem('Could not save {0}'.format(new_item['id']))
        except Exception as e:
            # NOTE(review): within the visible code this branch only logs the
            # traceback; `e` is unused and nothing is returned or re-raised.
            # If no statement follows, the method returns None here - confirm
            # that is intended.
            spider.logger.exception('Unknown error')