# NOTE: the imports below are reconstructed for illustration and the exact
# module paths are partly assumptions -- in particular, the homes of
# quick_dirty_fallback_geocode, log_exception, and the logger setup are
# not shown in this excerpt.
import datetime
import logging
import sys

import feedparser
from django.contrib.gis.geos import Point

from ebdata.retrieval.utils import convert_entities
from ebpub.db.models import NewsItem, Schema
from ebpub.geocoder.base import GeocodingException

logger = logging.getLogger('add_news')


def main(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
                                        #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!"
                        % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            if point:
                x, y = point.split(' ')
            else:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                try:
                    x, y = quick_dirty_fallback_geocode(text, parse=True)
                except GeocodingException:
                    logger.debug("Geocoding exception on %r:" % text)
                    log_exception(level=logging.DEBUG)
                    continue
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
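# A minimal sketch of how this script might be invoked from the command
# line (an assumption for illustration; the real script's entry point is
# not shown in this excerpt). Any CLI arguments are passed through as the
# optional argv that main() reads the feed URL from.
if __name__ == '__main__':
    main(sys.argv[1:])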
def main(argv=None):
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)
    f = feedparser.parse(url)
    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
            print "Already have %r (id %d)" % (item.title, item.id)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        try:
            item.schema = schema
            item.title = convert_entities(e.title)
            item.description = convert_entities(e.description)
            item.url = e.link
            item.location_name = e.get('x-calconnect-street') or e.get('georss_featurename')
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
            if 'point' in e:
                x, y = e.point.split(' ')
            elif 'georss_point' in e:
                x, y = e.georss_point.split(' ')
            else:
                text = item.title + ' ' + item.description
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(text, parse=True)
                if None in (x, y):
                    print " couldn't geocode '%s...'" % item.title[:30]
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                print "Skipping %r as it has bad location 0,0" % item.title
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    print " Reverse-geocoded point to %r" % block.pretty_name
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    print " Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)
                    item.location_name = u''
            item.save()
            print "Added: %s" % item.title
        except:
            print "Warning: couldn't save %r. Traceback:" % item.title
            import cStringIO, traceback
            f = cStringIO.StringIO()
            traceback.print_exc(file=f)
            msg = f.getvalue()
            print msg
def unique_fields(self, list_record):
    # not necessarily primary key, but for this script's purposes
    # these are the fields that in combination uniquely identify
    # an article.
    date = datetime.date(*list_record['updated_parsed'][:3])
    precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                 'E13', 'E18', 'E5']
    precinct = None
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we can set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        self.logger.debug("no precinct found in tags %r" % tags)
    if 'Boston 24' in tags:
        # TODO: the 'Boston 24' tag indicates posts with aggregate
        # daily stats. Make a separate schema for aggregates,
        # with attributes like those used in
        # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
        # These are citywide though, not by precinct.
        # So what would be the Location? Whole city??
        self.logger.info("boston daily crime stats, we don't know how to "
                         "handle these yet")

    description = list_record['content'][0]['value']
    # TODO: we should have a stock 'clean up html' function.
    description = preprocess_to_string(
        description,
        drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input',
                   'p', 'strong', 'map', 'small', 'span', 'sub', 'sup',
                   'topic', 'u'),
        drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select',
                    'textarea'),
        drop_attrs=('background', 'border', 'cellpadding', 'cellspacing',
                    'class', 'clear', 'id', 'rel', 'style', 'target'))
    from ebdata.retrieval.utils import convert_entities
    description = convert_entities(description)
    #description = description.replace('&nbsp;', ' ').replace('&#160;', ' ')

    addrs = parse_addresses(description)
    if not addrs:
        self.logger.info("no addresses found in %r" % list_record['title'])

    location = None
    location_name = u''
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            from geocoder_hack import quick_dirty_fallback_geocode
            x, y = quick_dirty_fallback_geocode(addr)
            if (x, y) != (None, None):
                location = Point((float(x), float(y)))
                location_name = addr.title()
        except:
            print "ugh, %r" % addr
            # XXX log something

    return dict(item_date=date,
                location=location,
                location_name=location_name,
                title=list_record['title'],
                description=description,
                )
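# A rough smoke-test sketch for unique_fields (hypothetical, not part of
# the scraper): it stubs the scraper object and fakes only the keys of a
# feedparser entry that the method reads -- 'updated_parsed', 'tags',
# 'content', and 'title'. It assumes unique_fields is in scope as a plain
# function, and running it still needs the ebdata helpers
# (preprocess_to_string, parse_addresses) and the fallback geocoder.
import logging

class FakeScraper(object):
    logger = logging.getLogger('test')
    unique_fields = unique_fields  # reuse the method defined above

record = {
    'updated_parsed': (2011, 3, 14, 12, 0, 0, 0, 73, 0),
    'tags': [{'term': 'B2'}, {'term': 'larceny'}],
    'content': [{'value': '<p>Larceny reported at 123 Main St.</p>'}],
    'title': 'Larceny report',
}
print FakeScraper().unique_fields(record)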