class TestSmartGeocoder(django.test.TestCase):
    fixtures = ["wabash.yaml"]

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch("ebpub.streets.models.get_metro")
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {"city_name": "CHICAGO", "multiple_cities": False}
        address = self.geocoder.geocode("200 S Wabash")
        self.assertEqual(address["city"], "Chicago")

    @mock.patch("ebpub.streets.models.get_metro")
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {"city_name": "CHICAGO", "multiple_cities": False}
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, "220 Wabash")

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode,
                          "100000 S Wabash")

    @mock.patch("ebpub.streets.models.get_metro")
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {"city_name": "CHICAGO", "multiple_cities": False}
        address = self.geocoder.geocode("200 block of Wabash")
        self.assertEqual(address["city"], "Chicago")

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode("Wabash and Jackson")
        self.assertEqual(address["city"], "CHICAGO")
def clean(self):
    loc_info = self.cleaned_data.get('location')
    if isinstance(loc_info, list):
        # olwidget wraps geometries up as lists in case there are several per map.
        assert len(loc_info) == 1
        loc_info = loc_info[0]
    if not loc_info:
        address = self.cleaned_data.get('address')
        if not address:
            self._append_error(
                'location', u'Either an address or a location must be specified.')
        else:
            # Try to geocode the address...
            try:
                geocoder = SmartGeocoder()
                addr = geocoder.geocode(address)
                loc_info = addr['point']
            except AmbiguousResult:
                self._append_error(
                    'location',
                    u'Address is ambiguous, please specify a point directly.')
            except GeocodingException:
                self._append_error(
                    'location',
                    u'Unable to geocode address, please correct the address or specify a point directly.')
    # Again, olwidget expects these to be lists...
    loc_info = [loc_info]
    self.cleaned_data['location'] = loc_info
    return super(PlaceAdminForm, self).clean()
class BaseGeocoderTestCase(django.test.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        address = self.geocoder.geocode('200 S Wabash')
        self.assertEqual(address['city'], 'Chicago')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '220 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode,
                          '100000 S Wabash')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
def __init__(self, use_cache=True):
    if not use_cache:
        self.retriever = Retriever(cache=None, sleep=self.sleep, timeout=self.timeout)
    else:
        self.retriever = Retriever(sleep=self.sleep, timeout=self.timeout)
    self.logger = logging.getLogger('eb.retrieval.%s' % self.logname)
    self.start_time = datetime.datetime.now()
    self._geocoder = SmartGeocoder()
def __init__(self, *args, **kwargs):
    if self.logname is None:
        self.logname = '%s.%s' % (settings.SHORT_NAME, self.schema_slugs[0])
    super(NewsItemListDetailScraper, self).__init__(*args, **kwargs)
    self._schema_cache = None
    self._schemas_cache = None
    self._lookups_cache = None
    self._schema_fields_cache = None
    self._schema_field_mapping_cache = None
    self._geocoder = SmartGeocoder()
def full_geocode(query, search_places=True):
    """
    Tries the full geocoding stack on the given query (a string):

    * Normalizes whitespace/capitalization
    * Searches the Misspelling table to correct location misspellings
    * Searches the Location table
    * Failing that, searches the Place table (if search_places is True)
    * Failing that, uses SmartGeocoder to parse this as an address
    * Failing that, raises whichever error is raised by the geocoder --
      except AmbiguousResult, in which case all possible results are
      returned

    Returns a dictionary of {type, result, ambiguous}, where ambiguous is
    True or False and type can be:

    * 'location' -- in which case result is a Location object.
    * 'place' -- in which case result is a Place object. (This is only
      possible if search_places is True.)
    * 'address' -- in which case result is an Address object as returned
      by geocoder.geocode().
    * 'block' -- in which case result is a list of Block objects.

    If ambiguous is True, result will be a list of objects.
    """
    query = normalize(query)
    # First, try correcting the spelling ("LAKEVIEW" -> "LAKE VIEW").
    try:
        miss = Misspelling.objects.get(incorrect=query)
    except Misspelling.DoesNotExist:
        pass
    else:
        query = miss.correct
    # Search the Location table.
    try:
        loc = Location.objects.get(normalized_name=query)
    except Location.DoesNotExist:
        pass
    else:
        return {'type': 'location', 'result': loc, 'ambiguous': False}
    # Search the Place table, for stuff like "Sears Tower".
    if search_places:
        places = Place.objects.filter(normalized_name=query)
        if len(places) == 1:
            return {'type': 'place', 'result': places[0], 'ambiguous': False}
        elif len(places) > 1:
            return {'type': 'place', 'result': places, 'ambiguous': True}
    # Try geocoding this as an address.
    geocoder = SmartGeocoder()
    try:
        result = geocoder.geocode(query)
    except AmbiguousResult, e:
        return {'type': 'address', 'result': e.choices, 'ambiguous': True}
def geocode(schema=None):
    """
    Geocode NewsItems with null locations.

    If ``schema`` is provided, only geocode NewsItems with that
    particular schema slug.
    """
    geocoder = SmartGeocoder()
    qs = NewsItem.objects.filter(location__isnull=True).order_by('-id')
    if schema is not None:
        print "Geocoding %s..." % schema
        qs = qs.filter(schema__slug=schema)
    else:
        print "Geocoding all ungeocoded newsitems..."
    geocoded_count = 0
    not_found_count = 0
    ambiguous_count = 0
    parsing_error_count = 0
    invalid_block_count = 0
    found_any = False
    for ni in qs.iterator():
        found_any = True
        loc_name = ni.location_name
        try:
            add = geocoder.geocode(loc_name)
        except InvalidBlockButValidStreet:
            print ' invalid block but valid street: %s' % loc_name
            invalid_block_count += 1
        except AmbiguousResult:
            print ' ambiguous: %s' % loc_name
            ambiguous_count += 1
        except GeocodingException:
            print ' not found: %s' % loc_name
            not_found_count += 1
        except ParsingError:
            print ' parse error: %s' % loc_name
            parsing_error_count += 1
        else:
            ni.location = add['point']
            ni.block = add['block']
            ni.save()
            print '%s (%s)' % (loc_name, ni.item_url())
            geocoded_count += 1
    if not found_any:
        print "No NewsItems with null locations found"
    print "------------------------------------------------------------------"
    print "Geocoded: %s" % geocoded_count
    print "Not found: %s" % not_found_count
    print "Ambiguous: %s" % ambiguous_count
    print "Parse errors: %s" % parsing_error_count
    print "Invalid blocks: %s" % invalid_block_count
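A minimal command-line driver for the batch geocoder above might look like this (a sketch; the optparse wiring and option name are assumptions, only geocode() itself comes from the code above):

# Hypothetical driver for geocode(); the --schema option name is an assumption.
from optparse import OptionParser

if __name__ == '__main__':
    parser = OptionParser(usage='usage: %prog [--schema SLUG]')
    parser.add_option('--schema', dest='schema', default=None,
                      help='only geocode NewsItems with this schema slug')
    options, args = parser.parse_args()
    geocode(schema=options.schema)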
def full_geocode(query, search_places=True):
    """
    Tries the full geocoding stack on the given query (a string):

    * Normalizes whitespace/capitalization
    * Resolves known misspellings via the LocationSynonym table
    * Searches the Location table
    * Failing that, searches the Place table (if search_places is True)
    * Failing that, uses SmartGeocoder to parse this as an address
    * Failing that, raises whichever error is raised by the geocoder --
      except AmbiguousResult, in which case all possible results are
      returned

    Returns a dictionary of {type, result, ambiguous}, where ambiguous is
    True or False and type can be:

    * 'location' -- in which case result is a Location object.
    * 'place' -- in which case result is a Place object. (This is only
      possible if search_places is True.)
    * 'address' -- in which case result is an Address object as returned
      by geocoder.geocode().
    * 'block' -- in which case result is a list of Block objects.

    If ambiguous is True, result will be a list of objects.
    """
    # Search the Location table.
    try:
        canonical_loc = LocationSynonym.objects.get_canonical(query)
        loc = Location.objects.get(normalized_name=canonical_loc)
    except Location.DoesNotExist:
        pass
    else:
        logger.debug('geocoded %r to Location %s' % (query, loc))
        return {'type': 'location', 'result': loc, 'ambiguous': False}
    # Search the Place table, for stuff like "Sears Tower".
    if search_places:
        canonical_place = PlaceSynonym.objects.get_canonical(query)
        places = Place.objects.filter(normalized_name=canonical_place)
        if len(places) == 1:
            logger.debug(u'geocoded %r to Place %s' % (query, places[0]))
            return {'type': 'place', 'result': places[0], 'ambiguous': False}
        elif len(places) > 1:
            logger.debug(u'geocoded %r to multiple Places: %s' % (query, unicode(places)))
            return {'type': 'place', 'result': places, 'ambiguous': True}
    # Try geocoding this as an address.
    geocoder = SmartGeocoder(use_cache=getattr(settings, 'EBPUB_CACHE_GEOCODER', False))
    try:
        result = geocoder.geocode(query)
    except AmbiguousResult, e:
        logger.debug('Multiple addresses for %r' % query)
        return {'type': 'address', 'result': e.choices, 'ambiguous': True}
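Given the {type, result, ambiguous} contract documented above, a caller can dispatch on the return value like this (a usage sketch; the query string is hypothetical, only full_geocode() itself comes from the code above):

# Sketch of consuming full_geocode()'s return value.
info = full_geocode('Sears Tower')
if info['ambiguous']:
    # result is a list of candidates; pick one or present the choices.
    for choice in info['result']:
        print choice
elif info['type'] in ('location', 'place'):
    print info['result'].normalized_name
elif info['type'] == 'address':
    # Address dict as returned by geocoder.geocode().
    print info['result']['point']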
def add_newsitem(seed_url, seed_name, url, article_headline, article_date, name_excerpts):
    schema = Schema.objects.get(slug='news-articles')
    geocoder = SmartGeocoder()
    try:
        s = Seed.objects.get(url=seed_url)
    except Seed.DoesNotExist:
        s = Seed.objects.create(
            url=seed_url,
            base_url=seed_url,
            delay=0,
            depth=0,
            is_crawled=False,
            is_rss_feed=False,
            is_active='t',
            rss_full_entry=False,
            normalize_www=3,
            pretty_name=seed_name,
            schema=schema,
            autodetect_locations=True,
            guess_article_text=False,
            strip_noise=False,
            city='',
        )
    try:
        p = Page.objects.get(url=url)
    except Page.DoesNotExist:
        html = UnicodeRetriever().fetch_data(url)
        p = Page.objects.create(
            seed=s,
            url=url,
            scraped_url=url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
    data_tuples = []
    for location_name, excerpt in name_excerpts:
        point = geocoder.geocode(location_name)  # Let exceptions bubble up.
        data_tuples.append((location_name, point['point'], excerpt, point['block']))
    return geotag_page(p.id, seed_name, schema, url, data_tuples,
                       article_headline, article_date)
def add_newsitem(seed_url, seed_name, url, article_headline, article_date, name_excerpts):
    schema = Schema.objects.get(slug='news-articles')
    geocoder = SmartGeocoder()
    try:
        s = Seed.objects.get(url=seed_url)
    except Seed.DoesNotExist:
        s = Seed.objects.create(
            url=seed_url,
            base_url=seed_url,
            delay=0,
            depth=0,
            is_crawled=False,
            is_rss_feed=False,
            is_active='t',
            rss_full_entry=False,
            normalize_www=3,
            pretty_name=seed_name,
            schema=schema,
            autodetect_locations=True,
            guess_article_text=False,
            strip_noise=False,
            city='',
        )
    try:
        p = Page.objects.get(url=url)
    except Page.DoesNotExist:
        html = UnicodeRetriever().get_html(url)
        p = Page.objects.create(
            seed=s,
            url=url,
            scraped_url=url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
    data_tuples = []
    for location_name, excerpt in name_excerpts:
        point = geocoder.geocode(location_name)  # Let exceptions bubble up.
        data_tuples.append((location_name, point['point'], excerpt, point['block']))
    return geotag_page(p.id, seed_name, schema, url, data_tuples,
                       article_headline, article_date)
def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data, by falling
    back to external geocoders.
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        addrs = parse_addresses(addr)
    else:
        # parse_addresses() yields (address, city) pairs, so wrap the raw
        # string in a matching tuple for the loop below.
        addrs = [(addr, None)]
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                logger.debug("internally geocoded %r" % addr)
                return point.x, point.y
            except GeocodingException:
                logger.debug("internal geocoder failed on %r:\n" % addr)
                log_exception(level=logging.DEBUG)
                x, y = None, None
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data. But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally.
                continue
        except:
            logger.error('uncaught geocoder exception on %r\n' % addr)
            log_exception()
    return None, None
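A quick sketch of how this fallback might be called (the sample text is hypothetical; only quick_dirty_fallback_geocode() itself comes from the code above):

# Sketch: parse free text for addresses and take the first geocodable x, y.
x, y = quick_dirty_fallback_geocode('fire at 123 Main St. near downtown')
if x is not None:
    print 'got point: %s, %s' % (x, y)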
class TestSmartGeocoder(django.test.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        result = self.geocoder.geocode('200 S Wabash Ave')
        self.assertEqual(result['city'], 'Chicago')
        self.assertEqual(result['address'], '200 S Wabash Ave.')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder__wrong_suffix_works(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        result = self.geocoder.geocode('220 S Wabash St')
        self.assertEqual(result['address'], '220 S Wabash Ave.')
        # Or none at all.
        result = self.geocoder.geocode('220 S Wabash')
        self.assertEqual(result['address'], '220 S Wabash Ave.')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        # Ambiguous because of missing pre_dir.
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '220 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode,
                          '100000 S Wabash')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO', 'multiple_cities': False}
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
class BaseGeocoderTestCase(unittest.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    def test_address_geocoder(self):
        address = self.geocoder.geocode('200 S Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_address_geocoder_ambiguous(self):
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '200 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode,
                          '100000 S Wabash')

    def test_block_geocoder(self):
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
def main(argv=None):
    url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=boston&scope=bonzai'
    slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=slug)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % slug
        sys.exit(1)
    f = feedparser.parse(url)
    geocoder = SmartGeocoder()
    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        item.schema = schema
        item.title = e.title
        item.description = e.description
        item.url = e.link
        #item.location_name = e['x-calconnect-street']
        item.item_date = datetime.datetime(*e.updated_parsed[:6])
        item.pub_date = datetime.datetime(*e.updated_parsed[:6])
        try:
            if 'point' in e:
                x, y = e.point.split(' ')
            else:
                x, y = e.georss_point.split(' ')
            # GeoRSS points are "lat lon"; GEOS Points take (x, y) == (lon, lat).
            item.location = Point((float(y), float(x)))
            item.save()
        except:
            pass
        print "Added: %s" % item.title
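The coordinate swap above is easy to get backwards: GeoRSS serializes points as "lat lon", while a GEOS Point takes (x, y), i.e. (lon, lat). A standalone check (the sample coordinates are hypothetical):

from django.contrib.gis.geos import Point

georss_point = '42.3584 -71.0598'  # "lat lon", per the GeoRSS convention
lat, lon = georss_point.split(' ')
pt = Point((float(lon), float(lat)))  # GEOS wants x=lon, y=lat
assert (pt.x, pt.y) == (-71.0598, 42.3584)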
def fix_newsitem_coords(item, dry_run=True):
    """
    Try to fix a (presumably bad) NewsItem geometry by reversing its
    coordinates, or reverse-geocoding if it has a location name; use
    whatever works. If dry_run=False, the item will be saved.
    """
    if item.location is not None:
        loc = item.location.centroid
        print "Found %r outside bounds at %s, %s" % (item.title, loc.x, loc.y)
    else:
        loc = None
        print "NO location on %s: %s" % (item.schema.slug, item.title)
    fixed = False
    if item.location_name:
        from ebpub.geocoder import SmartGeocoder, AmbiguousResult
        try:
            result = SmartGeocoder().geocode(item.location_name)
        except AmbiguousResult, e:
            print "...%d choices, picking the first one" % len(e.choices)
            result = e.choices[0]
        except:
def update_from_query_params(self, request):
    """
    Update the filters based on query parameters.

    After this is called, it's recommended to redirect to a normalized
    form of the URL, e.g. self.sort(); self.make_url()

    This takes care to preserve query parameters that aren't used by
    any of the NewsitemFilters.
    """
    # Make a mutable copy so we can leave only the params that
    # FilterChain doesn't know about.
    params = request.GET.copy()

    def pop_key(key):
        # request.GET.pop() returns a sequence.
        # We only want a single value, stripped.
        val = params.pop(key, [''])[0]
        return val.strip()

    address = pop_key('address')
    if address:
        xy_radius, block_radius, cookies_to_set = block_radius_value(request)
        params.pop('radius', None)
        result = None
        try:
            result = SmartGeocoder().geocode(address)
        except AmbiguousResult, e:
            raise BadAddressException(address, block_radius,
                                      address_choices=e.choices)
        except (GeocodingException, ParsingError):
            raise BadAddressException(address, block_radius,
                                      address_choices=())
#
#   everyblock is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with everyblock. If not, see <http://www.gnu.org/licenses/>.
#

from ebdata.retrieval.utils import locations_are_close
from ebpub.db.models import NewsItem
from ebpub.geocoder import SmartGeocoder, ParsingError, GeocodingException
from django.contrib.gis.geos import Point

geocoder = SmartGeocoder()

THRESHOLD = 375

def fix_crime_geom():
    qs = NewsItem.objects.filter(schema__slug='crime', location__isnull=False)
    count = qs.count()
    for i, ni in enumerate(qs.iterator()):
        print '# => Checking %s of %s' % (i, count)
        x, y = [float(n) for n in ni.attributes['xy'].split(';')]
        pt = Point((x, y))
        pt.srid = 4326
        location_name = ni.location_name.replace('XX', '01')
        try:
            result = geocoder.geocode(location_name)
        except (GeocodingException, ParsingError):
class NewsItemListDetailScraper(ListDetailScraper):
    """
    A ListDetailScraper that saves its data into the NewsItem table.

    Subclasses are required to set the `schema_slugs` attribute.

    self.schemas lazily loads the list of Schema objects the first
    time it's accessed. It is a dictionary in the format {slug: Schema}.

    self.schema is available if schema_slugs has only one element. It's
    the Schema object.

    self.lookups lazily loads a dictionary of all SchemaFields with
    lookup=True. The dictionary is in the format {name: schemafield}.
    If schema_slugs has more than one element, self.lookups is a
    dictionary in the format {schema_slug: {name: schemafield}}.

    self.schema_field_mapping lazily loads a dictionary of each
    SchemaField, mapping the name to the real_name. If schema_slugs
    has more than one element, self.schema_field_mapping is a
    dictionary in the format {schema_slug: {name: real_name}}.
    """
    schema_slugs = None
    logname = None

    def __init__(self, *args, **kwargs):
        if self.logname is None:
            self.logname = '%s.%s' % (settings.SHORT_NAME, self.schema_slugs[0])
        super(NewsItemListDetailScraper, self).__init__(*args, **kwargs)
        self._schema_cache = None
        self._schemas_cache = None
        self._lookups_cache = None
        self._schema_fields_cache = None
        self._schema_field_mapping_cache = None
        self._geocoder = SmartGeocoder()

    # schemas, schema, lookups and schema_field_mapping are all lazily loaded
    # so that this scraper can be run (in raw_data(), xml_data() or
    # display_data()) without requiring a valid database to be set up.

    def _get_schemas(self):
        if self._schemas_cache is None:
            self._schemas_cache = dict([(s, Schema.objects.get(slug=s))
                                        for s in self.schema_slugs])
        return self._schemas_cache
    schemas = property(_get_schemas)

    def _get_schema(self):
        if self._schema_cache is None:
            if len(self.schema_slugs) > 1:
                raise AttributeError('self.schema is only available if len(schema_slugs) == 1')
            self._schema_cache = self.schemas[self.schema_slugs[0]]
        return self._schema_cache
    schema = property(_get_schema)

    def _get_lookups(self):
        if self._lookups_cache is None:
            lc = dict([(s.slug, dict([(sf.name, sf)
                                      for sf in s.schemafield_set.filter(is_lookup=True)]))
                       for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                lc = lc[self.schema_slugs[0]]
            self._lookups_cache = lc
        return self._lookups_cache
    lookups = property(_get_lookups)

    def _get_schema_fields(self):
        if self._schema_fields_cache is None:
            sfs = dict([(s.slug, dict([(sf.name, sf)
                                       for sf in s.schemafield_set.all()]))
                        for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                sfs = sfs[self.schema_slugs[0]]
            self._schema_fields_cache = sfs
        return self._schema_fields_cache
    schema_fields = property(_get_schema_fields)

    def _get_schema_field_mapping(self):
        if self._schema_field_mapping_cache is None:
            schema_objs = self.schemas.values()
            mapping = field_mapping([s.id for s in schema_objs])
            fm = dict([(s.slug, mapping[s.id]) for s in schema_objs])
            if len(self.schema_slugs) == 1:
                fm = fm[self.schema_slugs[0]]
            self._schema_field_mapping_cache = fm
        return self._schema_field_mapping_cache
    schema_field_mapping = property(_get_schema_field_mapping)

    def get_or_create_lookup(self, schema_field_name, name, code,
                             description='', schema=None, make_text_slug=True):
        """
        Returns the Lookup instance matching the given Schema slug,
        SchemaField name and Lookup.code, creating it (with the given
        name/code/description) if it doesn't already exist.

        If make_text_slug is True, then a slug will be created from the
        given name. If it's False, then the slug will be the Lookup's ID.
""" if len(self.schema_slugs) > 1: sf = self.lookups[schema][schema_field_name] else: sf = self.lookups[schema_field_name] return Lookup.objects.get_or_create_lookup(sf, name, code, description, make_text_slug, self.logger) @transaction.commit_on_success def create_newsitem(self, attributes, **kwargs): """ Creates and saves a NewsItem with the given kwargs. Returns the new NewsItem. kwargs MUST have the following keys: title item_date location_name For any other kwargs whose values aren't provided, this will use sensible defaults. kwargs may optionally contain a 'convert_to_block' boolean. If True, this will convert the given kwargs['location_name'] to a block level but will use the real (non-block-level) address for geocoding and Block association. attributes is a dictionary to use to populate this NewsItem's Attribute object. """ block = location = None if 'location' not in kwargs: location = self.geocode(kwargs['location_name']) if location: block = location['block'] location = location['point'] if kwargs.pop('convert_to_block', False): kwargs['location_name'] = address_to_block(kwargs['location_name']) # If the exact address couldn't be geocoded, try using the # normalized location name. if location is None: location = self.geocode(kwargs['location_name']) if location: block = location['block'] location = location['point'] # Normally we'd just use "schema = kwargs.get('schema', self.schema)", # but self.schema will be evaluated even if the key is found in # kwargs, which raises an error when using multiple schemas. schema = kwargs.get('schema', None) schema = schema or self.schema ni = NewsItem.objects.create( schema=schema, title=kwargs['title'], description=kwargs.get('description', ''), url=kwargs.get('url', ''), pub_date=kwargs.get('pub_date', self.start_time), item_date=kwargs['item_date'], location=kwargs.get('location', location), location_name=kwargs['location_name'], location_object=kwargs.get('location_object', None), block=kwargs.get('block', block), ) ni.attributes = attributes self.num_added += 1 self.logger.info(u'Created NewsItem %s (total created in this scrape: %s)', ni.id, self.num_added) return ni @transaction.commit_on_success def update_existing(self, newsitem, new_values, new_attributes): """ Given an existing NewsItem and dictionaries new_values and new_attributes, determines which values and attributes have changed and saves the object and/or its attributes if necessary. """ newsitem_updated = False # First, check the NewsItem's values. for k, v in new_values.items(): if getattr(newsitem, k) != v: self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, getattr(newsitem, k), v)) setattr(newsitem, k, v) newsitem_updated = True if newsitem_updated: newsitem.save() # Next, check the NewsItem's attributes. for k, v in new_attributes.items(): if newsitem.attributes[k] != v: self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, newsitem.attributes[k], v)) newsitem.attributes[k] = v newsitem_updated = True if newsitem_updated: self.num_changed += 1 self.logger.debug('Total changed in this scrape: %s', self.num_changed) else: self.logger.debug('No changes to NewsItem %s detected', newsitem.id) def update(self): """ Updates the Schema.last_updated fields after scraping is done. """ self.num_added = 0 self.num_changed = 0 update_start = datetime.datetime.now() # We use a try/finally here so that the DataUpdate object is created # regardless of whether the scraper raised an exception. 
        try:
            got_error = True
            super(NewsItemListDetailScraper, self).update()
            got_error = False
        finally:
            # Rollback, in case the database is in an aborted transaction.
            # This avoids the "psycopg2.ProgrammingError: current transaction
            # is aborted, commands ignored until end of transaction block"
            # error.
            from django.db import connection
            connection._rollback()
            update_finish = datetime.datetime.now()
            # Clear the Schema cache, in case the schemas have been updated
            # in the database since we started the scrape.
            self._schemas_cache = self._schema_cache = None
            for s in self.schemas.values():
                s.last_updated = datetime.date.today()
                s.save()
                DataUpdate.objects.create(
                    schema=s,
                    update_start=update_start,
                    update_finish=update_finish,
                    num_added=self.num_added,
                    num_changed=self.num_changed,
                    # None of our scrapers delete records yet, but we have the
                    # plumbing in place here in case future scrapers need to do
                    # that.
                    num_deleted=0,
                    num_skipped=self.num_skipped,
                    got_error=got_error,
                )

    def geocode(self, location_name):
        """
        Tries to geocode the given location string, returning a dictionary
        as returned by SmartGeocoder.geocode() (with 'point' and 'block'
        keys), or None.
        """
        try:
            return self._geocoder.geocode(location_name)
        except (GeocodingException, ParsingError):
            return None

    def safe_location(self, location_name, geom, max_distance=200):
        """
        Returns a location (geometry) to use, given a location_name and
        geometry. This is used for data sources that publish both a geometry
        and a location_name -- we double-check that the geometry is within a
        certain `max_distance` from the geocoded location_name.

        If there's a discrepancy or if the location_name can't be geocoded,
        this returns None.
        """
        location = self.geocode(location_name)
        if location is None:
            return None
        location_point = location['point']
        if not location_point:
            return None
        location_point.srid = 4326
        is_close, distance = locations_are_close(location_point, geom, max_distance)
        if not is_close:
            return None
        return geom
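A usage sketch for the scraper API above (the subclass, schema slug, and record field names are hypothetical; create_newsitem() and the save() signature come from the code in this document):

class DemoScraper(NewsItemListDetailScraper):
    schema_slugs = ('demo-reports',)  # hypothetical slug; must exist in the db

    def save(self, old_record, list_record, detail_record):
        # Geocoding and Block association happen inside create_newsitem().
        self.create_newsitem(
            attributes={},
            title=list_record['title'],
            item_date=list_record['date'],
            location_name=list_record['address'],
            convert_to_block=True,
        )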
class BaseScraper(object):
    """
    Base class for all scrapers in ebdata.retrieval.scrapers.
    """
    logname = 'basescraper'
    sleep = 0
    timeout = 20

    def __init__(self, use_cache=True):
        if not use_cache:
            self.retriever = Retriever(cache=None, sleep=self.sleep, timeout=self.timeout)
        else:
            self.retriever = Retriever(sleep=self.sleep, timeout=self.timeout)
        self.logger = logging.getLogger('eb.retrieval.%s' % self.logname)
        self.start_time = datetime.datetime.now()
        self._geocoder = SmartGeocoder()

    def geocode(self, location_name, zipcode=None):
        """
        Tries to geocode the given location string, returning a dictionary
        as returned by SmartGeocoder.geocode(), or None.
        """
        # Try to look up the address; if it is ambiguous, attempt to use
        # any provided zipcode information to resolve the ambiguity.
        # The zipcode is not included in the initial pass because it
        # is often too picky, yielding no results when there is a
        # legitimate nearby zipcode identified in either the address
        # or street number data.
        try:
            return self._geocoder.geocode(location_name)
        except AmbiguousResult as result:
            # Try to resolve based on zipcode...
            if zipcode is None:
                self.logger.info(
                    "Ambiguous results for address %s. (no zipcode to resolve dispute)"
                    % (location_name, ))
                return None
            in_zip = [r for r in result.choices if r['zip'] == zipcode]
            if len(in_zip) == 0:
                self.logger.info(
                    "Ambiguous results for address %s, but none in specified zipcode %s"
                    % (location_name, zipcode))
                return None
            if len(in_zip) > 1:
                self.logger.info(
                    "Ambiguous results for address %s in zipcode %s, guessing first."
                    % (location_name, zipcode))
            return in_zip[0]
        except (GeocodingException, ParsingError):
            self.logger.info(
                "Could not geocode location: %s: %s"
                % (location_name, traceback.format_exc()))
            return None

    def update(self):
        'Run the scraper.'
        raise NotImplementedError()

    def fetch_data(self, *args, **kwargs):
        return self.retriever.fetch_data(*args, **kwargs)

    def get_html(self, *args, **kwargs):
        """An alias for fetch_data(). For backward compatibility."""
        return self.fetch_data(*args, **kwargs)

    @classmethod
    def parse_html(cls, html):
        from lxml import etree
        from cStringIO import StringIO
        return etree.parse(StringIO(html), etree.HTMLParser())

    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the new
        NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.

        kwargs MAY have the following keys:
            zipcode - used to disambiguate geocoded locations

        kwargs may optionally contain a 'convert_to_block' boolean. If True,
        this will convert the given kwargs['location_name'] to a block level
        but will use the real (non-block-level) address for geocoding and
        Block association.

        attributes is a dictionary to use to populate this NewsItem's
        Attribute objects.
        """
        location = kwargs.get('location')
        location_name = kwargs.get('location_name')
        assert location or location_name, \
            "At least one of location or location_name must be provided"
        if location is None:
            location = self.geocode(kwargs['location_name'],
                                    zipcode=kwargs.get('zipcode'))
            if location:
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'],
                                        zipcode=kwargs.get('zipcode'))
                if location:
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None) or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=location,
            location_name=location_name,
            location_object=kwargs.get('location_object', None),
        )
        if attributes is not None:
            ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s: %s (total created in this scrape: %s)',
                         schema.slug, ni.id, self.num_added)
        return ni

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have changed
        and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        else:
            self.logger.debug("No change to %s <%s>" % (newsitem.id, newsitem))
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if newsitem.attributes.get(k) == v:
                continue
            elif k not in newsitem.attributes:
                self.logger.info('ID %s %s was missing, setting to %r' %
                                 (newsitem.id, k, v))
            elif newsitem.attributes.get(k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, newsitem.attributes[k], v))
            newsitem.attributes[k] = v
            newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def create_or_update(self, old_record, attributes, **kwargs):
        """Unified API for updating or creating a NewsItem."""
        if old_record:
            self.update_existing(old_record, kwargs, attributes or {})
        else:
            self.create_newsitem(attributes=attributes, **kwargs)
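A short sketch of the zipcode disambiguation path above (the subclass, logname, address, and zipcode are hypothetical; the geocode() behavior comes from the code above):

class ZipAwareScraper(BaseScraper):
    logname = 'zip_demo'  # hypothetical

    def update(self):
        # The first geocoding pass omits the zipcode; it is only used to
        # pick among AmbiguousResult.choices, as implemented in geocode().
        result = self.geocode('220 Wabash', zipcode='60604')
        if result is not None:
            print result['point'], result['zip']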
def update_from_request(self, filter_sf_dict):
    """Update the list of filters based on the request params.

    After this is called, it's recommended to redirect to a normalized
    form of the URL, which you can get via self.sort(); self.make_url()

    ``filter_sf_dict`` is a mapping of name -> SchemaField which have
    either is_filter or is_searchable True. We remove SchemaFields that
    we create filters for. (This is so that templates can display input
    widgets for the ones we're not already filtering by.)

    TODO: This should not bail out on the first error; it should do as
    much as possible and signal multiple errors. (Use the forms
    framework?)
    """
    request, context = self.request, self.context
    qs = self.qs
    params = request.GET.copy()

    def pop_key(key, single=False):
        """
        Pop the value(s) from params, treat it as a comma-separated list
        of values, and split that into a list.
        So ?foo=bar,baz is equivalent to ?foo=bar&foo=baz.

        If single == True, return only the first one; in the example we'd
        return 'bar'. Otherwise, by default, return the list; in the
        example we'd return ['bar', 'baz'].
        """
        result = []
        # There doesn't seem to be a way to get a list of values *and*
        # remove it in one call, so use both getlist() and pop().
        values = params.getlist(key) or [u'']
        params.pop(key, None)
        for value in values:
            value = value.replace(u'+', u' ')  # XXX does django do this already?
            parts = [s.strip() for s in value.split(u',')]
            result.extend(parts)
        result = [r for r in result if r]
        if single:
            return result[0] if result else u''
        return result

    # IDs.
    ids = pop_key('id', single=False)
    if ids:
        self.replace('id', *ids)

    # Address.
    address = pop_key('address', single=True)
    if address:
        xy_radius, block_radius, cookies_to_set = block_radius_value(request)
        pop_key('radius')  # Just to remove it; block_radius_value() used it.
        result = None
        try:
            result = SmartGeocoder().geocode(address)
        except AmbiguousResult, e:
            raise BadAddressException(address, block_radius,
                                      address_choices=e.choices)
        except (GeocodingException, ParsingError):
            raise BadAddressException(address, block_radius,
                                      address_choices=())
def import_csv_view(self, request):
    if not self.has_add_permission(request):
        raise PermissionDenied
    if request.method == 'GET':
        import_form = PlaceImportForm()
    if request.method == 'POST':
        import_form = PlaceImportForm(request.POST, request.FILES)
    if not import_form.is_bound or not import_form.is_valid():
        return self._show_import_csv_form(request, import_form)

    # csv fields:
    # pretty_name, address, lat, lon, url, <synonym>, <synonym>, ...
    context = dict(
        errors=[],
        actions_taken=[],
    )
    validated_rows = []
    place_type = import_form.cleaned_data['place_type']
    try:
        csvfile = import_form.cleaned_data['csv_file']
        rows = csv.reader(csvfile)
    except:
        message = "Unable to read the specified CSV file"
        context['errors'].append(message)
        return self._show_import_csv_results(request, context)

    try:
        for row in rows:
            if len(row) < 2:
                message = "Line %d: Missing required fields." % rows.line_num
                context['errors'].append(message)
                continue
            synonyms = []
            point = None
            place_url = ''
            pretty_name, address = [x.strip() for x in row[0:2]]
            if pretty_name == '':
                message = "Line %d: Empty name" % rows.line_num
                context['errors'].append(message)
                continue
            if len(row) > 2:
                try:
                    lat, lon = row[2:4]
                    if lat != '' or lon != '':
                        lat = float(lat.strip())
                        lon = float(lon.strip())
                        point = geos.Point(lon, lat)
                    if len(row) > 4:
                        place_url = row[4]
                    synonyms = [x.strip() for x in row[5:]]
                except ValueError:
                    message = 'Line %d "%s": Invalid lat, lon' % (
                        rows.line_num, pretty_name)
                    context['errors'].append(message)
                    continue
            if point is None:
                if address == '':
                    message = 'Line %d "%s": Address and lat,lon are both empty.' % (
                        rows.line_num, pretty_name)
                    context['errors'].append(message)
                    continue
                # Try to geocode the address.
                try:
                    geocoder = SmartGeocoder()
                    addr = geocoder.geocode(address)
                    point = addr['point']
                except AmbiguousResult:
                    message = 'Line %d "%s": Address "%s" is ambiguous, please specify a point directly.' % (
                        rows.line_num, pretty_name, address)
                    context['errors'].append(message)
                    continue
                except GeocodingException:
                    message = 'Line %d "%s": Unable to geocode address "%s", please correct the address or specify a point directly.' % (
                        rows.line_num, pretty_name, address)
                    context['errors'].append(message)
                    continue
            # Phew!
            validated_rows.append(
                [pretty_name, address, point, place_url, synonyms])
    except csv.Error, e:
        message = "Stopped on line %d: %s" % (rows.line_num, e)
        context['errors'].append(message)
    return self._show_import_csv_results(request, context)
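Given the column layout noted in the comment above (pretty_name, address, lat, lon, url, synonyms...), an input file for this view might look like the following (the rows are hypothetical examples; lat/lon may be left blank to force geocoding of the address column):

# Hypothetical CSV input for the importer above:
#   Sears Tower,233 S Wacker Dr,41.8789,-87.6359,http://example.com/sears,Willis Tower
#   Art Institute,111 S Michigan Ave,,,,the Art Institute of Chicago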
class NewsItemListDetailScraper(ListDetailScraper):
    """
    A ListDetailScraper that saves its data into the NewsItem table.

    Subclasses are required to set the `schema_slugs` attribute.

    Once you've set schema_slugs, there are a number of properties for
    conveniently accessing the relevant Schemas and SchemaFields:

    self.schemas lazily loads the list of Schema objects the first
    time it's accessed. It is a dictionary in the format {slug: Schema}.

    self.schema is available if schema_slugs has only one element. It's
    the Schema object.

    self.lookups lazily loads a dictionary of all SchemaFields with
    lookup=True. The dictionary is in the format {name: schemafield}.
    If schema_slugs has more than one element, self.lookups is a
    dictionary in the format {schema_slug: {name: schemafield}}.

    self.schema_fields lazily loads a dictionary of each SchemaField,
    mapping the name to the SchemaField object. If schema_slugs has
    more than one element, self.schema_fields is a dictionary in the
    format {schema_slug: {name: schema_field}}.

    self.schema_field_mapping lazily loads a dictionary of each
    SchemaField, mapping the name to the real_name. If schema_slugs
    has more than one element, self.schema_field_mapping is a
    dictionary in the format {schema_slug: {name: real_name}}.
    """
    schema_slugs = None
    logname = None

    def __init__(self, *args, **kwargs):
        if self.logname is None:
            self.logname = '%s.%s' % (settings.SHORT_NAME, self.schema_slugs[0])
        super(NewsItemListDetailScraper, self).__init__(*args, **kwargs)
        self._schema_cache = None
        self._schemas_cache = None
        self._lookups_cache = None
        self._schema_fields_cache = None
        self._schema_field_mapping_cache = None
        self._geocoder = SmartGeocoder()

    # schemas, schema, lookups and schema_field_mapping are all lazily loaded
    # so that this scraper can be run (in raw_data(), xml_data() or
    # display_data()) without requiring a valid database to be set up.
    @property
    def schemas(self):
        if self._schemas_cache is None:
            self._schemas_cache = dict([(s, Schema.objects.get(slug=s))
                                        for s in self.schema_slugs])
        return self._schemas_cache

    @property
    def schema(self):
        if self._schema_cache is None:
            if len(self.schema_slugs) > 1:
                raise AttributeError('self.schema is only available if len(schema_slugs) == 1')
            self._schema_cache = self.schemas[self.schema_slugs[0]]
        return self._schema_cache

    @property
    def lookups(self):
        if self._lookups_cache is None:
            lc = dict([(s.slug, dict([(sf.name, sf)
                                      for sf in s.schemafield_set.filter(is_lookup=True)]))
                       for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                lc = lc[self.schema_slugs[0]]
            self._lookups_cache = lc
        return self._lookups_cache

    @property
    def schema_fields(self):
        if self._schema_fields_cache is None:
            sfs = dict([(s.slug, dict([(sf.name, sf)
                                       for sf in s.schemafield_set.all()]))
                        for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                sfs = sfs[self.schema_slugs[0]]
            self._schema_fields_cache = sfs
        return self._schema_fields_cache

    @property
    def schema_field_mapping(self):
        if self._schema_field_mapping_cache is None:
            schema_objs = self.schemas.values()
            mapping = field_mapping([s.id for s in schema_objs])
            fm = dict([(s.slug, mapping[s.id]) for s in schema_objs])
            if len(self.schema_slugs) == 1:
                fm = fm[self.schema_slugs[0]]
            self._schema_field_mapping_cache = fm
        return self._schema_field_mapping_cache

    def get_or_create_lookup(self, schema_field_name, name, code,
                             description='', schema=None, make_text_slug=True):
        """
        Returns the Lookup instance matching the given Schema slug,
        SchemaField name and Lookup.code, creating it (with the given
        name/code/description) if it doesn't already exist.

        If make_text_slug is True, then a slug will be created from the
        given name. If it's False, then the slug will be the Lookup's ID.
        """
        if len(self.schema_slugs) > 1:
            sf = self.lookups[schema][schema_field_name]
        else:
            sf = self.lookups[schema_field_name]
        return Lookup.objects.get_or_create_lookup(sf, name, code, description,
                                                   make_text_slug, self.logger)

    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the new
        NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.

        kwargs MAY have the following keys:
            zipcode - used to disambiguate geocoded locations

        kwargs may optionally contain a 'convert_to_block' boolean. If True,
        this will convert the given kwargs['location_name'] to a block level
        but will use the real (non-block-level) address for geocoding and
        Block association.

        attributes is a dictionary to use to populate this NewsItem's
        Attribute objects.
        """
        block = kwargs.get('block')
        location = kwargs.get('location')
        location_name = kwargs.get('location_name')
        assert location or location_name, \
            "At least one of location or location_name must be provided"
        if location is None:
            location = self.geocode(kwargs['location_name'],
                                    zipcode=kwargs.get('zipcode'))
            if location:
                block = location['block']
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'],
                                        zipcode=kwargs.get('zipcode'))
                if location:
                    block = location['block']
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None) or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=location,
            location_name=location_name,
            location_object=kwargs.get('location_object', None),
            block=block,
        )
        if attributes is not None:
            ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s: %s (total created in this scrape: %s)',
                         schema.slug, ni.id, self.num_added)
        return ni

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have changed
        and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        else:
            self.logger.debug("No change to %s <%s>" % (newsitem.id, newsitem))
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if newsitem.attributes.get(k) == v:
                continue
            elif k not in newsitem.attributes:
                self.logger.info('ID %s %s was missing, setting to %r' %
                                 (newsitem.id, k, v))
            elif newsitem.attributes.get(k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, newsitem.attributes[k], v))
            newsitem.attributes[k] = v
            newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def create_or_update(self, old_record, attributes, **kwargs):
        """Unified API for updating or creating a NewsItem."""
        if old_record:
            self.update_existing(old_record, kwargs, attributes or {})
        else:
            self.create_newsitem(attributes=attributes, **kwargs)

    def update(self):
        """
        Updates the Schema.last_updated fields after scraping is done.
        """
        self.num_added = 0
        self.num_changed = 0
        update_start = datetime.datetime.now()

        # We use a try/finally here so that the DataUpdate object is created
        # regardless of whether the scraper raised an exception.
        try:
            got_error = True
            super(NewsItemListDetailScraper, self).update()
            got_error = False
        finally:
            # Rollback, in case the database is in an aborted
            # transaction. This avoids the "psycopg2.ProgrammingError:
            # current transaction is aborted, commands ignored until
            # end of transaction block" error.
            from django.db import connection
            connection._rollback()
            update_finish = datetime.datetime.now()
            # Clear the Schema cache, in case the schemas have been
            # updated in the database since we started the scrape.
            self._schemas_cache = self._schema_cache = None
            for s in self.schemas.values():
                s.last_updated = datetime.date.today()
                s.save()
                DataUpdate.objects.create(
                    schema=s,
                    update_start=update_start,
                    update_finish=update_finish,
                    num_added=self.num_added,
                    num_changed=self.num_changed,
                    # None of our scrapers delete records yet, but we have the
                    # plumbing in place here in case future scrapers need to do
                    # that.
                    num_deleted=0,
                    num_skipped=self.num_skipped,
                    got_error=got_error,
                )

    def geocode(self, location_name, zipcode=None):
        """
        Tries to geocode the given location string, returning a dictionary
        as returned by SmartGeocoder.geocode(), or None.
        """
        # Try to look up the address; if it is ambiguous, attempt to use
        # any provided zipcode information to resolve the ambiguity.
        # The zipcode is not included in the initial pass because it
        # is often too picky, yielding no results when there is a
        # legitimate nearby zipcode identified in either the address
        # or street number data.
        try:
            return self._geocoder.geocode(location_name)
        except AmbiguousResult as result:
            # Try to resolve based on zipcode...
            if zipcode is None:
                self.logger.warning("Ambiguous results for address %s. (no zipcode to resolve dispute)"
                                    % (location_name, ))
                return None
            in_zip = [r for r in result.choices if r['zip'] == zipcode]
            if len(in_zip) == 0:
                self.logger.warning("Ambiguous results for address %s, but none in specified zipcode %s"
                                    % (location_name, zipcode))
                return None
            if len(in_zip) > 1:
                self.logger.warning("Ambiguous results for address %s in zipcode %s, guessing first."
                                    % (location_name, zipcode))
            return in_zip[0]
        except (GeocodingException, ParsingError):
            self.logger.warning("Could not geocode location: %s: %s"
                                % (location_name, traceback.format_exc()))
            return None

    def safe_location(self, location_name, geom, max_distance=200):
        """
        Returns a location (geometry) to use, given a location_name and
        geometry. This is used for data sources that publish both a geometry
        and a location_name -- we double-check that the geometry is within a
        certain `max_distance` from the geocoded location_name.

        If there's a discrepancy or if the location_name can't be geocoded,
        this returns None.
        """
        location = self.geocode(location_name)
        if location is None:
            return None
        location_point = location['point']
        if not location_point:
            return None
        location_point.srid = 4326
        is_close, distance = locations_are_close(location_point, geom, max_distance)
        if not is_close:
            return None
        return geom

    def last_updated_time(self, schema=None):
        """
        Returns a DateTime representing the last time we started scraping
        our schema(s). (We use start time rather than end time on the
        assumption that a few overlaps are preferable to missing updates.)
        """
        schema = schema or self.schema
        try:
            # Filter by schema and take the most recent start time; the
            # previous version took the oldest DataUpdate of any schema.
            last_update = DataUpdate.objects.filter(schema=schema).order_by('-update_start')[0]
            return last_update.update_start
        except IndexError:
            # Use the unix epoch (1970) as a stand-in for "never updated".
            return datetime.datetime.fromtimestamp(0)
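A sketch of using safe_location() to cross-check a feed-supplied point (the coordinates are hypothetical, and `scraper` stands in for any NewsItemListDetailScraper instance):

from django.contrib.gis.geos import Point

# Trust the feed's point only if it lies within `max_distance` of where
# the location_name geocodes to, per safe_location() above.
feed_point = Point((-87.6359, 41.8789), srid=4326)
geom = scraper.safe_location('233 S Wacker Dr', feed_point, max_distance=200)
if geom is None:
    # Discrepancy or geocoding failure: treat the feed geometry as suspect.
    pass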
def save(self, old_record, list_record, detail_record):
    # TODO: move some of this to clean_list_record?
    date = datetime.date(*list_record['updated_parsed'][:3])

    # Get the precinct from the tags.
    precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                 'E13', 'E18', 'E5']
    precinct = None
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        self.logger.debug("no precinct found in tags %r" % tags)

    description = list_record['summary']
    full_description = list_record['content'][0]['value']
    full_description = text_from_html(full_description)
    addrs = parse_addresses(full_description)
    if not addrs:
        self.logger.info("no addresses found in %r %r"
                         % (list_record['title'], list_record['link']))
        return

    location = None
    location_name = u''
    block = None
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            location = SmartGeocoder().geocode(addr)
        except GeocodingException:
            log_exception(level=logging.DEBUG)
            continue
        location_name = location['address']
        block = location['block']
        location = location['point']
        break
    else:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return

    kwargs = dict(
        item_date=date,
        location=location,
        location_name=location_name,
        title=list_record['title'],
        description=description,
        url=list_record['link'],
    )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
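The extract-then-geocode loop above follows a reusable pattern (a sketch; the sample HTML is hypothetical, parse_addresses is imported from ebdata.nlp.addresses as elsewhere in this code, and the import path for text_from_html is an assumption):

# Sketch: pull addresses out of HTML and geocode the first one that works.
from ebdata.nlp.addresses import parse_addresses

html = u'<p>Officers responded to 200 S Wabash around midnight.</p>'
text = text_from_html(html)  # assumed available as in the snippet above
for addr, city in parse_addresses(text):
    try:
        result = SmartGeocoder().geocode(addr)
    except GeocodingException:
        continue
    print result['address'], result['point']
    break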
def auto_locations(paragraph_list, default_city=''):
    """
    Given a list of strings, detects all valid, unique addresses and
    returns a tuple (result, report), where result is a list of tuples
    in the format (address, point, excerpt, block) and report is a
    string of what happened.

    If default_city is given, it will be used in the geocoding for
    detected addresses that don't specify a city.
    """
    result, report = [], []
    addresses_seen = set()
    geocoder = SmartGeocoder()
    for para in paragraph_list:
        for addy, city in parse_addresses(para):
            # Skip addresses if they have a city that's a known suburb.
            if city and Suburb.objects.filter(
                    normalized_name=normalize(city)).count():
                report.append('got suburb "%s, %s"' % (addy, city))
                continue

            # Try geocoding the address. If a city was provided, first try
            # geocoding with the city, then fall back to just the address
            # (without the city).
            point = None
            attempts = [addy]
            if default_city:
                attempts.insert(0, '%s, %s' % (addy, default_city))
            if city and city.lower() != default_city.lower():
                attempts.insert(0, '%s, %s' % (addy, city))
            for attempt in attempts:
                try:
                    point = geocoder.geocode(attempt)
                    break
                except AmbiguousResult:
                    report.append('got ambiguous address "%s"' % attempt)
                    # Don't try any other address attempts, because they only
                    # get *more* ambiguous. Plus, the subsequent attempts could
                    # be incorrect. For example, with this:
                    #   addy = '100 Broadway'
                    #   city = 'Manhattan'
                    #   default_city = 'Brooklyn'
                    # There are multiple "100 Broadway" addresses in Manhattan,
                    # so geocoding should fail at this point. It should not
                    # fall back to try the default_city (Brooklyn).
                    break
                except (DoesNotExist, InvalidBlockButValidStreet):
                    report.append('got nonexistent address "%s"' % attempt)
                except ParsingError:
                    report.append('got parsing error "%s"' % attempt)

            if point is None:
                continue  # This address could not be geocoded.

            if point['address'] in addresses_seen:
                continue
            if len(para) > 300:
                try:
                    excerpt = smart_excerpt(para, addy)
                except ValueError:
                    excerpt = para
            else:
                excerpt = para
            result.append((addy, point['point'], excerpt, point['block']))
            addresses_seen.add(point['address'])
    return (result, '; '.join(report))
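A quick usage sketch for auto_locations(); the paragraphs and the city are invented for illustration:

# Illustrative call; the input paragraphs are made up.
paras = [
    "Crews responded to a fire at 200 S Wabash early Monday.",
    "A second alarm was called near Wabash and Jackson.",
]
locations, report = auto_locations(paras, default_city='Chicago')
for addy, point, excerpt, block in locations:
    print addy, point.wkt
print 'report:', report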
class BaseScraper(object):
    """
    Base class for all scrapers in ebdata.retrieval.scrapers.
    """
    logname = 'basescraper'
    sleep = 0
    timeout = 20

    def __init__(self, use_cache=True):
        if not use_cache:
            self.retriever = Retriever(cache=None, sleep=self.sleep,
                                       timeout=self.timeout)
        else:
            self.retriever = Retriever(sleep=self.sleep, timeout=self.timeout)
        self.logger = logging.getLogger('eb.retrieval.%s' % self.logname)
        self.start_time = datetime.datetime.now()
        self._geocoder = SmartGeocoder()
        self.num_added = 0
        self.num_changed = 0

    def geocode(self, location_name, zipcode=None):
        """
        Tries to geocode the given location string, returning an
        address dict (with a 'point' key) or None.
        """
        # Try to look up the address; if it is ambiguous, attempt to use
        # any provided zipcode information to resolve the ambiguity.
        # The zipcode is not included in the initial pass because it is
        # often too picky, yielding no results when there is a
        # legitimate nearby zipcode identified in either the address
        # or street number data.
        try:
            return self._geocoder.geocode(location_name)
        except AmbiguousResult as result:
            # Try to resolve based on the zipcode...
            if zipcode is None:
                self.logger.info(
                    "Ambiguous results for address %s. (no zipcode to resolve dispute)"
                    % (location_name,))
                return None
            in_zip = [r for r in result.choices if r['zip'] == zipcode]
            if len(in_zip) == 0:
                self.logger.info(
                    "Ambiguous results for address %s, but none in specified zipcode %s"
                    % (location_name, zipcode))
                return None
            if len(in_zip) > 1:
                self.logger.info(
                    "Ambiguous results for address %s in zipcode %s, guessing first."
                    % (location_name, zipcode))
            return in_zip[0]
        except (GeocodingException, ParsingError):
            self.logger.info(
                "Could not geocode location: %s: %s"
                % (location_name, traceback.format_exc()))
            return None

    def update(self):
        'Run the scraper.'
        raise NotImplementedError()

    def fetch_data(self, *args, **kwargs):
        return self.retriever.fetch_data(*args, **kwargs)

    def get_html(self, *args, **kwargs):
        """An alias for fetch_data(), for backward compatibility."""
        return self.fetch_data(*args, **kwargs)

    @classmethod
    def parse_html(cls, html):
        from lxml import etree
        from cStringIO import StringIO
        return etree.parse(StringIO(html), etree.HTMLParser())

    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the
        new NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.

        kwargs MAY have the following keys:
            zipcode - used to disambiguate geocoded locations

        kwargs may optionally contain a 'convert_to_block' boolean. If
        True, this will convert the given kwargs['location_name'] to a
        block level but will use the real (non-block-level) address for
        geocoding and Block association.

        attributes is a dictionary to use to populate this NewsItem's
        Attribute objects.
        """
        location = kwargs.get('location')
        location_name = kwargs.get('location_name')
        assert location or location_name, \
            "At least one of location or location_name must be provided"
        if location is None:
            location = self.geocode(kwargs['location_name'],
                                    zipcode=kwargs.get('zipcode'))
            if location:
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'],
                                        zipcode=kwargs.get('zipcode'))
                if location:
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None) or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=location,
            location_name=location_name,
            location_object=kwargs.get('location_object', None),
        )
        if attributes is not None:
            ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s: %s (total created in this scrape: %s)',
                         schema.slug, ni.id, self.num_added)
        return ni

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have
        changed and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r'
                                 % (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        else:
            self.logger.debug("No change to %s <%s>" % (newsitem.id, newsitem))
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Strip the zone here too, as above.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if newsitem.attributes.get(k) == v:
                continue
            if k not in newsitem.attributes:
                self.logger.info('ID %s %s was missing, setting to %r'
                                 % (newsitem.id, k, v))
            else:
                self.logger.info('ID %s %s changed from %r to %r'
                                 % (newsitem.id, k, newsitem.attributes[k], v))
            newsitem.attributes[k] = v
            newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def create_or_update(self, old_record, attributes, **kwargs):
        """Unified API for updating or creating a NewsItem."""
        if old_record:
            self.update_existing(old_record, kwargs, attributes or {})
        else:
            self.create_newsitem(attributes=attributes, **kwargs)
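To show how the BaseScraper API fits together, here is a hedged sketch of a hypothetical subclass; fetch_records(), the logname, and the record fields are all invented for illustration:

# Hypothetical subclass; nothing here comes from the excerpt above
# except the BaseScraper API itself.
class ExampleNewsScraper(BaseScraper):
    logname = 'example_news'

    def update(self):
        for record in self.fetch_records():  # hypothetical helper
            old = None  # look up an existing NewsItem here, if any
            self.create_or_update(
                old,
                attributes=None,
                title=record['title'],
                item_date=record['date'],
                location_name=record['address'],
                zipcode=record.get('zip'),  # used to disambiguate geocoding
            )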
def update(self):
    logger.info("Starting LocalNewsScraper update %s" % self.url)
    try:
        schema = Schema.objects.get(slug=self.schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
        return 1
    response, content = self.http.request(self.url)
    if response.fromcache:
        logger.info("Feed is unchanged since last update (cached)")
        return
    f = feedparser.parse(content)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id, title=title,
                                        description=description)  # url=item_url
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # We've seen the same story come through with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!"
                        % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            # Support both georss and xcal for getting the location name.
            # TODO: should also support ev:location per
            # http://web.resource.org/rss/1.0/modules/event/
            item.location_name = (entry.get('xCal_x-calconnect-street')
                                  or entry.get('x-calconnect-street')
                                  or entry.get('georss_featurename')
                                  or entry.get('featurename'))
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            _short_title = item.title[:30] + '...'

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            else:
                if item.location_name:
                    text = item.location_name
                else:
                    # Geocode whatever we can find.
                    text = item.title + ' ' + item.description
                logger.debug("...Falling back on geocoding from %r..." % text[:50])
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        if not item.location_name:
                            item.location_name = result['address']
                        break
                    except GeocodingException:
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except Exception:
                        logger.error('uncaught geocoder exception on %r\n' % addr)
                        log_exception()
            if None in (x, y):
                logger.debug("Skip, couldn't geocode any addresses in item '%s...'"
                             % _short_title)
                continue
            item.location = Point((float(x), float(y)))
            if not intersects_metro_bbox(item.location):
                reversed_loc = Point((float(y), float(x)))
                if intersects_metro_bbox(reversed_loc):
                    logger.info("Got points in apparently reverse order, flipping them")
                    item.location = reversed_loc
                else:
                    logger.info("Skipping %r as %s,%s is out of bounds"
                                % (_short_title, y, x))
                    continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.info(" Skip, failed to reverse geocode %s for %r"
                                % (item.location.wkt, _short_title))
                    continue
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, _short_title))
        except Exception:
            logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
            log_exception()
    logger.info("Finished LocalNewsScraper update: %d added, %d updated"
                % (addcount, updatecount))
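The coordinate-order handling above is worth a standalone illustration: GeoRSS serializes latitude first, while a GEOS Point takes (x, y), i.e. (lon, lat). A small sketch of the convention the scraper relies on, with invented sample coordinates:

# GeoRSS point strings are "lat lon"; GEOS Point wants (lon, lat).
# Sample coordinates are invented (roughly downtown Chicago).
from django.contrib.gis.geos import Point

georss_point = "41.8781 -87.6298"
y, x = georss_point.split(' ')           # latitude (Y) comes first
location = Point((float(x), float(y)))   # Point takes (lon, lat)
assert location.x == -87.6298 and location.y == 41.8781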
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id, title=title,
                                        description=description)  # url=item_url
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # We've seen the same story come through with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!"
                        % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = (entry.get('x-calconnect-street')
                                  or entry.get('georss_featurename'))
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            if None in (x, y):
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except Exception:
                        logger.error('uncaught geocoder exception on %r\n' % addr)
                        log_exception()
            if None in (x, y):
                logger.info("couldn't geocode '%s...'" % item.title[:30])
                continue
            item.location = Point((float(x), float(y)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
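For completeness, a hedged sketch of a command-line entry point for this script-style updater; nothing in the excerpt confirms how it is actually invoked, so the main guard below is an assumption:

# Hypothetical entry point; passing a feed URL on the command line
# overrides the Boston default above.
if __name__ == '__main__':
    import sys
    sys.exit(update(sys.argv[1:]) or 0)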