def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data, by
    falling back to external geocoders.

    ``addr`` is either free text to scan for addresses (``parse=True``)
    or a single already-extracted address string (``parse=False``).

    Returns an (x, y) tuple, or (None, None) if nothing geocodes.
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        # parse_addresses() yields (address, city) pairs.
        addrs = parse_addresses(addr)
    else:
        # Wrap the single address in the same (address, city) shape so
        # the loop below can unpack it.  (The old code appended the bare
        # string, which would blow up on "for addr, unused in addrs".)
        addrs = [(addr, '')]
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                logger.debug("internally geocoded %r" % addr)
                return point.x, point.y
            except GeocodingException:
                logger.debug("internal geocoder failed on %r:\n" % addr)
                log_exception(level=logging.DEBUG)
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data. But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally
                continue
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt /
            # SystemExit still propagate.
            logger.error('uncaught geocoder exception on %r\n' % addr)
            log_exception()
    return None, None
def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data, by
    falling back to external geocoders.

    ``addr`` is either free text to scan for addresses (``parse=True``)
    or a single already-extracted address string (``parse=False``).

    Returns an (x, y) tuple, or (None, None) if nothing geocodes.
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        # parse_addresses() yields (address, city) pairs.
        addrs = parse_addresses(addr)
    else:
        # Wrap the single address in the same (address, city) shape so
        # the loop below can unpack it.  (The old code appended the bare
        # string, which would blow up on "for addr, unused in addrs".)
        addrs = [(addr, '')]
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                logger.debug("internally geocoded %r" % addr)
                return point.x, point.y
            except GeocodingException:
                logger.debug("internal geocoder failed on %r:\n" % addr)
                log_exception(level=logging.DEBUG)
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data. But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally
                continue
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt /
            # SystemExit still propagate.
            logger.error('uncaught geocoder exception on %r\n' % addr)
            log_exception()
    return None, None
def get_address(text=""):
    """
    Parse addresses out of ``text`` and geocode the first one that
    yields a usable result.

    Returns a dict with 'address', 'latitude' and 'longitude' keys;
    values are None if nothing could be geocoded.
    """
    _addresses = addresses.parse_addresses(text)
    a_dict = {'address': None, 'latitude': None, 'longitude': None}
    for address in _addresses:
        a_dict = geocode(address[0], city=settings.CITY, state=settings.STATE)
        if a_dict['address']:
            # Only use the first successful match.  (The old code kept
            # looping and merely ``continue``d past the remaining
            # addresses once one had been found.)
            break
    return a_dict
def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data, by
    falling back to external geocoders.

    Returns an (x, y) tuple, or (None, None) on total failure.
    NOTE(review): Python 2 code (``print`` statements, ``urllib2``).
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        # parse_addresses() yields (address, city) pairs.
        addrs = parse_addresses(addr)
    else:
        # NOTE(review): a bare string here will fail to unpack in the
        # "for addr, unused in addrs" loop below -- presumably this is
        # only ever called with parse=True; confirm before relying on it.
        addrs = [addr]
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                print "YAY internally geocoded %r" % addr
                return point.x, point.y
            except:
                x,y = None, None
                sys.stderr.write("BOO internal geocoder failed on %r:\n" % addr)
                log_exception()
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data. But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally
                continue
            # NOTE(review): everything below is intentionally dead code:
            # the ``try`` branch returns and the ``except`` branch
            # continues, so the external-geocoder fallback never runs
            # (kept on purpose per the XXX comment above).
            if None in (x, y):
                # XXX log something
                # Other geocoders need to know the city
                addr += ', Boston, MA'
                from geopy import geocoders
                g = geocoders.Google(resource='maps', output_format='json')
                import urllib2
                try:
                    for unused, (lat, lon) in g.geocode(addr, exactly_one=False):
                        print "YAY google geocoded %r" % addr
                        # geopy yields (lat, lon); return (x, y) order.
                        return (lon, lat)
                except urllib2.HTTPError:
                    # Rate throttled? Try another.
                    pass
                except ValueError:
                    # Bad JSON response? why?
                    pass
                us = geocoders.GeocoderDotUS()
                for unused, (lat, lon) in us.geocode(addr, exactly_one=False):
                    print "YAY geocoder.us geocoded %r" % addr
                    return (lon, lat)
        except:
            sys.stderr.write(
                '===== uncaught geocoder exception on %r\n' % addr)
            log_exception()
            sys.stderr.write('======================\n')
    return None, None
def geocode_if_needed(self, point, location_name, address_text='', **kwargs):
    """
    If either ``point`` or ``location_name`` is not set, try to geocode /
    reverse-geocode as needed to derive one from the other. Returns
    (point, location_name).

    If neither one is set, try to parse addresses out of
    ``address_text`` and derive both.

    Either value may be None if it can't be determined.

    Any other keyword args are passed to ``full_geocode()``.
    """
    if not point:
        text = convert_entities(location_name or address_text)
        self.logger.debug("...Falling back on geocoding from '%s...'" % text[:50])
        addrs = parse_addresses(text)
        for addr, unused in addrs:
            try:
                result = self.geocode(addr, **kwargs)
                if result is not None:
                    point = result['point']
                    self.logger.debug("internally geocoded %r" % addr)
                    # TODO: what if it's a Place?
                    if not location_name:
                        location_name = result['address']
                    break
            except Exception:
                # Narrowed from a bare except: geocoder failures are
                # best-effort here, but KeyboardInterrupt / SystemExit
                # should not be swallowed.
                self.logger.exception(
                    'uncaught geocoder exception on %r\n' % addr)
                continue
    if point and not location_name:
        # Fall back to reverse-geocoding.
        from ebpub.geocoder import reverse
        try:
            block, distance = reverse.reverse_geocode(point)
            self.logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
            location_name = block.pretty_name
        except reverse.ReverseGeocodeError:
            location_name = None
    return (point, location_name)
def geocode_if_needed(self, point, location_name, address_text='', **kwargs):
    """
    If either ``point`` or ``location_name`` is not set, try to geocode /
    reverse-geocode as needed to derive one from the other. Returns
    (point, location_name).

    If neither one is set, try to parse addresses out of
    ``address_text`` and derive both.

    Either value may be None if it can't be determined.

    Any other keyword args are passed to ``full_geocode()``.
    """
    if not point:
        text = convert_entities(location_name or address_text)
        self.logger.debug("...Falling back on geocoding from '%s...'" % text[:50])
        addrs = parse_addresses(text)
        for addr, unused in addrs:
            try:
                result = self.geocode(addr, **kwargs)
                if result is not None:
                    point = result['point']
                    self.logger.debug("internally geocoded %r" % addr)
                    # TODO: what if it's a Place?
                    if not location_name:
                        location_name = result['address']
                    break
            except Exception:
                # Narrowed from a bare except: geocoder failures are
                # best-effort here, but KeyboardInterrupt / SystemExit
                # should not be swallowed.
                self.logger.exception('uncaught geocoder exception on %r\n' % addr)
                continue
    if point and not location_name:
        # Fall back to reverse-geocoding.
        from ebpub.geocoder import reverse
        try:
            block, distance = reverse.reverse_geocode(point)
            self.logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
            location_name = block.pretty_name
        except reverse.ReverseGeocodeError:
            location_name = None
    return (point, location_name)
def assertParses(self, text, expected):
    """Assert that parse_addresses(text) yields exactly ``expected``."""
    actual = parse_addresses(text)
    self.assertEqual(actual, expected)
def auto_locations(paragraph_list, default_city=''):
    """
    Given a list of strings, detects all valid, unique addresses and
    returns a tuple (result, report), where result is a list of tuples
    in the format (address, wkt, excerpt, block) and report is a string
    of what happened.

    If default_city is given, it will be used in the geocoding for
    detected addresses that don't specify a city.
    """
    result, report = [], []
    # point['address'] values already emitted, so we dedupe across paragraphs.
    addresses_seen = set()
    geocoder = SmartGeocoder()
    for para in paragraph_list:
        for addy, city in parse_addresses(para):
            # Skip addresses if they have a city that's a known suburb.
            if city and Suburb.objects.filter(
                    normalized_name=normalize(city)).count():
                report.append('got suburb "%s, %s"' % (addy, city))
                continue
            # Try geocoding the address. If a city was provided, first try
            # geocoding with the city, then fall back to just the address
            # (without the city).
            point = None
            attempts = [addy]
            if default_city:
                attempts.insert(0, '%s, %s' % (addy, default_city))
            if city and city.lower() != default_city.lower():
                # The explicitly-parsed city takes precedence over
                # default_city (inserted last, so it ends up first).
                attempts.insert(0, '%s, %s' % (addy, city))
            for attempt in attempts:
                try:
                    point = geocoder.geocode(attempt)
                    break
                except AmbiguousResult:
                    report.append('got ambiguous address "%s"' % attempt)
                    # Don't try any other address attempts, because they only
                    # get *more* ambiguous. Plus, the subsequent attempts could
                    # be incorrect. For example, with this:
                    #   addy = '100 Broadway'
                    #   city = 'Manhattan'
                    #   default_city = 'Brooklyn'
                    # There are multiple "100 Broadway" addresses in Manhattan,
                    # so geocoding should fail at this point. It should not
                    # roll back to try the default_city (Brooklyn).
                    break
                except (DoesNotExist, InvalidBlockButValidStreet):
                    report.append('got nonexistent address "%s"' % attempt)
                except ParsingError:
                    report.append('got parsing error "%s"' % attempt)
            if point is None:
                continue # This address could not be geocoded.
            if point['address'] in addresses_seen:
                continue
            # For long paragraphs, try to show just the sentence(s)
            # around the address; fall back to the whole paragraph.
            if len(para) > 300:
                try:
                    excerpt = smart_excerpt(para, addy)
                except ValueError:
                    excerpt = para
            else:
                excerpt = para
            result.append((addy, point['point'], excerpt, point['block']))
            addresses_seen.add(point['address'])
    return (result, '; '.join(report))
def unique_fields(self, list_record):
    """
    Return a dict of the fields that (in combination) uniquely identify
    an article -- not necessarily a primary key -- plus location info
    derived from addresses found in the entry text.  Returns None when
    the entry has no tags.
    """
    date = datetime.date(*list_record['updated_parsed'][:3])
    precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                 'E13', 'E18', 'E5']
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    precinct = None
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we can set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        # Fixed: the old loop reused ``precinct`` as the loop variable,
        # leaving it bound to the last tag, so this branch could never
        # fire for a non-empty tag list.
        self.logger.debug("no precinct found in tags %r" % tags)
    if 'Boston 24' in tags:
        # TODO: the 'Boston 24' tag indicates posts with aggregate
        # daily stats. Make a separate schema for aggregates,
        # with attributes like those used in
        # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
        # These are citywide though, not by precinct.
        # So what would be the Location? Whole city??
        self.logger.info("boston daily crime stats, we don't know how to "
                         "handle these yet")
    description = list_record['content'][0]['value']
    # TODO: we should have a stock 'clean up html' function.
    description = preprocess_to_string(
        description,
        drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input',
                   'p', 'strong', 'map', 'small', 'span', 'sub', 'sup',
                   'topic', 'u'),
        drop_trees=('applet', 'button', 'embed', 'iframe', 'object',
                    'select', 'textarea'),
        drop_attrs=('background', 'border', 'cellpadding', 'cellspacing',
                    'class', 'clear', 'id', 'rel', 'style', 'target'))
    from ebdata.retrieval.utils import convert_entities
    description = convert_entities(description)
    addrs = parse_addresses(description)
    if not addrs:
        self.logger.info("no addresses found in %r" % list_record['title'])
    location = None
    location_name = u''
    # No ``break``: the last address that geocodes wins, matching the
    # original behavior.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            from geocoder_hack import quick_dirty_fallback_geocode
            x, y = quick_dirty_fallback_geocode(addr)
            if (x, y) != (None, None):
                location = Point((float(x), float(y)))
                location_name = addr.title()
        except Exception:
            # Was a bare ``except:`` with a Python 2 ``print`` and an
            # "XXX log something" note; log through the scraper's logger.
            self.logger.exception("ugh, %r" % addr)
    return dict(item_date=date,
                location=location,
                location_name=location_name,
                title=list_record['title'],
                description=description,
                )
def update(self):
    """
    Fetch the feed at ``self.url`` and add/update a NewsItem for each
    entry, taking the location from GeoRSS data when present and
    otherwise geocoding addresses found in the entry text.  Skips the
    whole run when the HTTP cache reports the feed unchanged.
    """
    logger.info("Starting LocalNewsScraper update %s" % self.url)
    try:
        schema = Schema.objects.get(slug=self.schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
        return 1
    response, content = self.http.request(self.url)
    if response.fromcache:
        logger.info("Feed is unchanged since last update (cached)")
        return
    f = feedparser.parse(content)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            # Identity is (schema, title, description); the URL was
            # deliberately left out (see the commented-out arg).
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
                                        #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            # Support both georss and xcal for getting the location name.
            # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
            item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            _short_title = item.title[:30] + '...'
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            else:
                if item.location_name:
                    text = item.location_name
                else:
                    # Geocode whatever we can find.
                    text = item.title + ' ' + item.description
                logger.debug("...Falling back on geocoding from %r..." % text[:50])
                addrs = parse_addresses(text)
                # Stop on the first address that geocodes.
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        if not item.location_name:
                            item.location_name = result['address']
                        item.block = result['block']
                        break
                    except GeocodingException:
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error('uncaught geocoder exception on %r\n' % addr)
                        log_exception()
            if None in (x, y):
                logger.debug("Skip, couldn't geocode any addresses in item '%s...'" % _short_title)
                continue
            item.location = Point((float(x), float(y)))
            if not intersects_metro_bbox(item.location):
                # Some feeds deliver coordinates in the reverse order;
                # try the swapped point before giving up.
                reversed_loc = Point((float(y), float(x)))
                if intersects_metro_bbox(reversed_loc):
                    logger.info(
                        "Got points in apparently reverse order, flipping them")
                    item.location = reversed_loc
                else:
                    logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x))
                    continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                    continue
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, _short_title))
        except:
            # Broad catch so one bad entry can't kill the whole run.
            logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
            log_exception()
    logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
def save(self, old_record, list_record, detail_record):
    """
    Create or update a NewsItem from a parsed feed entry, geocoding the
    first address found in the entry's full text.
    """
    # TODO: move some of this to clean_list_record?
    date = datetime.date(*list_record["updated_parsed"][:3])
    # Get the precinct from the tags.
    precincts = ["A1", "A7", "B2", "B3", "C11", "C6", "D14", "D4",
                 "E13", "E18", "E5"]
    tags = [t["term"] for t in list_record["tags"]]
    if not tags:
        return
    precinct = None
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        # Fixed: the old loop reused ``precinct`` as the loop variable,
        # so it ended up bound to the last tag and this branch never
        # fired for a non-empty tag list.
        self.logger.debug("no precinct found in tags %r" % tags)
    description = list_record["summary"]
    full_description = list_record["content"][0]["value"]
    full_description = text_from_html(full_description)
    addrs = parse_addresses(full_description)
    if not addrs:
        self.logger.info("no addresses found in %r %r" % (list_record["title"], list_record["link"]))
        return
    location = None
    location_name = u""
    block = None
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            location = SmartGeocoder().geocode(addr)
        except GeocodingException:
            log_exception(level=logging.DEBUG)
            continue
        location_name = location["address"]
        # NOTE(review): ``block`` is captured but not passed along --
        # TODO: wire it into create_or_update?
        block = location["block"]
        location = location["point"]
        break
    else:
        self.logger.info("no addresses geocoded in %r" % list_record["title"])
        return
    kwargs = dict(
        item_date=date,
        location=location,
        location_name=location_name,
        title=list_record["title"],
        description=description,
        url=list_record["link"],
    )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
def update(self):
    """
    Fetch the feed at ``self.url`` and add/update a NewsItem for each
    entry, taking the location from GeoRSS data when present and
    otherwise geocoding addresses found in the entry text.  Skips the
    whole run when the HTTP cache reports the feed unchanged.
    """
    logger.info("Starting LocalNewsScraper update %s" % self.url)
    try:
        schema = Schema.objects.get(slug=self.schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
        return 1
    response, content = self.http.request(self.url)
    if response.fromcache:
        logger.info("Feed is unchanged since last update (cached)")
        return
    f = feedparser.parse(content)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            # Identity is (schema, title, description); the URL was
            # deliberately left out (see the commented-out arg).
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
                                        #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            # Support both georss and xcal for getting the location name.
            # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
            item.location_name = entry.get(
                'xCal_x-calconnect-street') or entry.get(
                'x-calconnect-street') or entry.get(
                'georss_featurename') or entry.get('featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            _short_title = item.title[:30] + '...'
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            else:
                if item.location_name:
                    text = item.location_name
                else:
                    # Geocode whatever we can find.
                    text = item.title + ' ' + item.description
                logger.debug("...Falling back on geocoding from %r..." % text[:50])
                addrs = parse_addresses(text)
                # Stop on the first address that geocodes.
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        if not item.location_name:
                            item.location_name = result['address']
                        break
                    except GeocodingException:
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error(
                            'uncaught geocoder exception on %r\n' % addr)
                        log_exception()
            if None in (x, y):
                logger.debug(
                    "Skip, couldn't geocode any addresses in item '%s...'" % _short_title)
                continue
            item.location = Point((float(x), float(y)))
            if not intersects_metro_bbox(item.location):
                # Some feeds deliver coordinates in the reverse order;
                # try the swapped point before giving up.
                reversed_loc = Point((float(y), float(x)))
                if intersects_metro_bbox(reversed_loc):
                    logger.info(
                        "Got points in apparently reverse order, flipping them"
                        )
                    item.location = reversed_loc
                else:
                    logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x))
                    continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(
                        item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.info(
                        " Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                    continue
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, _short_title))
        except:
            # Broad catch so one bad entry can't kill the whole run.
            logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
            log_exception()
    logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
def save(self, old_record, list_record, detail_record):
    """
    Create or update a NewsItem from a parsed feed entry, geocoding the
    first address found in the entry's full text.
    """
    # TODO: move some of this to clean_list_record?
    date = datetime.date(*list_record['updated_parsed'][:3])
    # Get the precinct from the tags.
    precincts = [
        'A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13', 'E18', 'E5'
    ]
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    precinct = None
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        # Fixed: the old loop reused ``precinct`` as the loop variable,
        # so it ended up bound to the last tag and this branch never
        # fired for a non-empty tag list.
        self.logger.debug("no precinct found in tags %r" % tags)
    description = list_record['summary']
    full_description = list_record['content'][0]['value']
    full_description = text_from_html(full_description)
    addrs = parse_addresses(full_description)
    if not addrs:
        self.logger.info("no addresses found in %r %r" %
                         (list_record['title'], list_record['link']))
        return
    location = None
    location_name = u''
    block = None
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            location = SmartGeocoder().geocode(addr)
        except GeocodingException:
            log_exception(level=logging.DEBUG)
            continue
        location_name = location['address']
        # NOTE(review): ``block`` is captured but not passed along --
        # TODO: wire it into create_or_update?
        block = location['block']
        location = location['point']
        break
    else:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return
    kwargs = dict(
        item_date=date,
        location=location,
        location_name=location_name,
        title=list_record['title'],
        description=description,
        url=list_record['link'],
    )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
def auto_locations(paragraph_list, default_city=''):
    """
    Given a list of strings, detects all valid, unique addresses and
    returns a tuple (result, report), where result is a list of tuples
    in the format (address, wkt, excerpt, block) and report is a string
    of what happened.

    If default_city is given, it will be used in the geocoding for
    detected addresses that don't specify a city.
    """
    result, report = [], []
    # point['address'] values already emitted, so we dedupe across paragraphs.
    addresses_seen = set()
    geocoder = SmartGeocoder()
    for para in paragraph_list:
        for addy, city in parse_addresses(para):
            # Skip addresses if they have a city that's a known suburb.
            if city and Suburb.objects.filter(normalized_name=normalize(city)).count():
                report.append('got suburb "%s, %s"' % (addy, city))
                continue
            # Try geocoding the address. If a city was provided, first try
            # geocoding with the city, then fall back to just the address
            # (without the city).
            point = None
            attempts = [addy]
            if default_city:
                attempts.insert(0, '%s, %s' % (addy, default_city))
            if city and city.lower() != default_city.lower():
                # The explicitly-parsed city takes precedence over
                # default_city (inserted last, so it ends up first).
                attempts.insert(0, '%s, %s' % (addy, city))
            for attempt in attempts:
                try:
                    point = geocoder.geocode(attempt)
                    break
                except AmbiguousResult:
                    report.append('got ambiguous address "%s"' % attempt)
                    # Don't try any other address attempts, because they only
                    # get *more* ambiguous. Plus, the subsequent attempts could
                    # be incorrect. For example, with this:
                    #   addy = '100 Broadway'
                    #   city = 'Manhattan'
                    #   default_city = 'Brooklyn'
                    # There are multiple "100 Broadway" addresses in Manhattan,
                    # so geocoding should fail at this point. It should not
                    # roll back to try the default_city (Brooklyn).
                    break
                except (DoesNotExist, InvalidBlockButValidStreet):
                    report.append('got nonexistent address "%s"' % attempt)
                except ParsingError:
                    report.append('got parsing error "%s"' % attempt)
            if point is None:
                continue # This address could not be geocoded.
            if point['address'] in addresses_seen:
                continue
            # For long paragraphs, try to show just the sentence(s)
            # around the address; fall back to the whole paragraph.
            if len(para) > 300:
                try:
                    excerpt = smart_excerpt(para, addy)
                except ValueError:
                    excerpt = para
            else:
                excerpt = para
            result.append((addy, point['point'], excerpt, point['block']))
            addresses_seen.add(point['address'])
    return (result, '; '.join(report))
def save(self, old_record, list_record, detail_record):
    """
    Create or update a NewsItem from a parsed feed entry, geocoding the
    first address found in the entry's full text.
    """
    # TODO: move some of this to clean_list_record?
    date = datetime.date(*list_record['updated_parsed'][:3])
    # Get the precinct from the tags.
    precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                 'E13', 'E18', 'E5']
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    precinct = None
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        # Fixed: the old loop reused ``precinct`` as the loop variable,
        # so it ended up bound to the last tag and this branch never
        # fired for a non-empty tag list.
        self.logger.debug("no precinct found in tags %r" % tags)
    description = list_record['summary']
    full_description = list_record['content'][0]['value']
    full_description = text_from_html(full_description)
    addrs = parse_addresses(full_description)
    if not addrs:
        self.logger.info("no addresses found in %r %r" % (list_record['title'], list_record['link']))
        return
    location = None
    location_name = u''
    block = None
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            location = SmartGeocoder().geocode(addr)
        except (GeocodingException, ParsingError):
            log_exception(level=logging.DEBUG)
            continue
        location_name = location['address']
        location = location['point']
        break
    else:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return
    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  title=list_record['title'],
                  description=description,
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
def update(argv=None):
    """
    Fetch the boston.com local-news search feed (or the URL given as
    ``argv[0]``) and add/update a NewsItem for each entry, geocoding
    addresses found in the title/description.
    """
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error(
            "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            # Identity is (schema, title, description); the URL was
            # deliberately left out (see the commented-out arg).
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
                                        #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # NOTE(review): GeoRSS order is lat lon, so x gets the
                # latitude here -- the Point((y, x)) below swaps back.
                x, y = point.split(' ')
            if True:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        # NOTE(review): here x, y = (lon, lat), which is
                        # the *opposite* convention of the georss branch
                        # above; Point((y, x)) then yields (lat, lon) --
                        # looks like a coordinate-order bug, confirm
                        # against the bbox check downstream.
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text, exc_info=True)
                        continue
                    except:
                        logger.exception('uncaught geocoder exception on %r\n' % addr)
            if None in (x, y):
                logger.info("couldn't geocode '%s...'" % item.title[:30])
                continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            # Broad catch so one bad entry can't kill the whole run.
            logger.exception("Warning: couldn't save %r. Traceback:" % item.title)
    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
def update(argv=None):
    """
    Fetch the boston.com local-news search feed (or the URL given as
    ``argv[0]``) and add/update a NewsItem for each entry, geocoding
    addresses found in the title/description.
    """
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            # Identity is (schema, title, description); the URL was
            # deliberately left out (see the commented-out arg).
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
                                        #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # NOTE(review): GeoRSS order is lat lon, so x gets the
                # latitude here -- the Point((y, x)) below swaps back.
                x, y = point.split(' ')
            if True:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        # NOTE(review): here x, y = (lon, lat), which is
                        # the *opposite* convention of the georss branch
                        # above; Point((y, x)) then yields (lat, lon) --
                        # looks like a coordinate-order bug, confirm
                        # against the bbox check downstream.
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error('uncaught geocoder exception on %r\n' % addr)
                        log_exception()
            if None in (x, y):
                logger.info("couldn't geocode '%s...'" % item.title[:30])
                continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            # Broad catch so one bad entry can't kill the whole run.
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))