def save_random_newsitem(schema, i, block):
    title = '%d Random %s %s' % (i, schema.name, uuid.uuid4())
    print "Creating %r" % title
    item = NewsItem()
    item.title = title
    item.schema = schema
    item.description = gibberis.ch.freeform.random_text(get_text_corpus(), 300)
    item.url = 'http://example.com/%s/%d' % (schema.slug, i)
    date = random_datetime(7.0)
    item.pub_date = date
    item.item_date = date.date()
    item.location_name = block.pretty_name
    item.block = block
    try:
        item.location = block.geom.centroid
    except AttributeError:
        item.location = block.geom
    # Populate the attributes.
    attrs = {}
    for schemafield in schema.schemafield_set.all():
        attrs[schemafield.name] = random_schemafield_value(schemafield)
    print "Added: %s at %s (%s)" % (item.title, item.location_name, item.location.wkt)
    # Need to save before we can have foreign keys from the attributes
    # or subclass.
    item.save()
    if attrs:
        item.attributes = attrs
        # That implicitly saves in the old model, but not the new.
    item.save()
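# random_datetime() is referenced above but not defined in this snippet.
# A hypothetical reconstruction (an assumption, not the original helper),
# consistent with the inline "now minus a random number of days" pattern
# the next function uses:
def random_datetime(days):
    # Pick a moment uniformly between now and `days` days ago.
    return datetime.datetime.now() - datetime.timedelta(days=random.uniform(0.0, float(days)))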
def main(count):
    schema = 'local-news'
    locations = list(Location.objects.all())
    random.shuffle(locations)
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)
    for i in range(int(count)):
        item = NewsItem()
        item.schema = schema
        item.title = '%d Random News %s' % (i, uuid.uuid1())
        item.description = item.title + ' blah' * 100
        item.url = 'http://example.com'
        # Random time between now and one week ago.
        date = datetime.datetime.now() - datetime.timedelta(random.uniform(0.0, 7.0))
        item.pub_date = item.item_date = date
        # Pick a random location from the ones we know.
        location = locations[i % len(locations)]
        item.location_object = location
        item.location_name = location.name
        # It would be cool to pick a random location within the bounds,
        # but that would take thought... use the center.
        try:
            item.location = location.location.centroid
        except AttributeError:
            print "whoops"
            continue
        print "Added: %s at %s (%s)" % (item.title, location.name, item.location.wkt)
        item.save()
def save_random_newsitem(schema, i, block):
    title = '%d Random %s %s' % (i, schema.name, uuid.uuid4())
    print "Creating %r" % title
    item = NewsItem()
    item.title = title
    item.schema = schema
    item.description = gibberis.ch.freeform.random_text(get_text_corpus(), 300)
    item.url = 'http://example.com/%s/%d' % (schema.slug, i)
    date = random_datetime(7.0)
    item.pub_date = date
    item.item_date = date.date()
    item.location_name = block.pretty_name
    try:
        item.location = block.geom.centroid
    except AttributeError:
        item.location = block.geom
    # Populate the attributes.
    attrs = {}
    for schemafield in schema.schemafield_set.all():
        attrs[schemafield.name] = random_schemafield_value(schemafield)
    print "Added: %s at %s (%s)" % (item.title, item.location_name, item.location.wkt)
    # Need to save before we can have foreign keys from the attributes
    # or subclass.
    item.save()
    if attrs:
        item.attributes = attrs
        # That implicitly saves in the old model, but not the new.
    item.save()
def main(): """ Download Calendar RSS feed and update database """ url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\ &new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\ &st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1""" schema = 'events' parser = OptionParser() parser.add_option('-q', '--quiet', action="store_true", dest="quiet", default=False, help="no output") (options, args) = parser.parse_args() if len(args) > 0: return parser.error('script does not take any arguments') try: schema = Schema.objects.get(slug=schema) except Schema.DoesNotExist: print "Schema (%s): DoesNotExist" % schema sys.exit(1) feed = feedparser.parse(url) for entry in feed.entries: try: item = NewsItem.objects.get(title=entry.title, description=entry.description) status = "Updated" except NewsItem.DoesNotExist: item = NewsItem() status = "Added" try: item.location_name = entry.get('xcal_x-calconnect-street') item.schema = schema item.title = convert_entities(entry.title) item.description = convert_entities(entry.description) item.url = entry.link item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point((float(entry['geo_long']), float(entry['geo_lat']))) if (item.location.x, item.location.y) == (0.0, 0.0): print "Skipping %r, bad location 0,0" % item.title continue item.save() if not options.quiet: print "%s: %s" % (status, item.title) except ValueError: if not options.quiet: print "unexpected error:", sys.exc_info()[1]
def main(): """ Download Calendar RSS feed and update database """ url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\ &new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\ &st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1""" schema = 'events' parser = OptionParser() parser.add_option('-q', '--quiet', action="store_true", dest="quiet", default=False, help="no output") (options, args) = parser.parse_args() if len(args) > 0: return parser.error('script does not take any arguments') try: schema = Schema.objects.get(slug=schema) except Schema.DoesNotExist: print "Schema (%s): DoesNotExist" % schema sys.exit(0) feed = feedparser.parse(url) for entry in feed.entries: try: item = NewsItem.objects.get(title=entry.title, description=entry.description) status = "Updated" except NewsItem.DoesNotExist: item = NewsItem() status = "Added" try: item.schema = schema item.title = entry.title item.description = entry.description item.url = entry.link item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point( (float(entry['geo_long']), float(entry['geo_lat']))) item.save() if not options.quiet: print "%s: %s" % (status, item.title) except ValueError: if not options.quiet: print "unexpected error:", sys.exc_info()[1]
def main(argv=None):
    url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=boston&scope=bonzai'
    schema = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)
    f = feedparser.parse(url)
    geocoder = SmartGeocoder()
    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        item.schema = schema
        item.title = e.title
        item.description = e.description
        item.url = e.link
        #item.location_name = e['x-calconnect-street']
        item.item_date = datetime.datetime(*e.updated_parsed[:6])
        item.pub_date = datetime.datetime(*e.updated_parsed[:6])
        try:
            if 'point' in e:
                x, y = e.point.split(' ')
            else:
                x, y = e.georss_point.split(' ')
            item.location = Point((float(y), float(x)))
            item.save()
        except:
            pass
        print "Added: %s" % item.title
# custom field crime type
crime_name = SchemaField()
crime_name.schema = crime_report
crime_name.pretty_name = "Crime Type"
crime_name.pretty_plural_name = "Crime Types"
crime_name.real_name = "varchar02"
crime_name.name = "crime_type"
crime_name.save()

# custom field crime code
crime_code = SchemaField()
crime_code.schema = crime_report
crime_code.pretty_name = "Crime Code"
crime_code.pretty_plural_name = "Crime Codes"
crime_code.real_name = "int01"
crime_code.name = "crime_code"
crime_code.save()

# create a Crime Report!
report = NewsItem()
report.schema = crime_report
report.title = "Hooligans causing disturbance downtown"
report.location_name = "123 Fakey St."
report.item_date = datetime.utcnow()
report.pub_date = datetime.utcnow()
report.description = "Blah Blah Blah"
report.save()
report.attributes['officer'] = "John Smith"
report.attributes['crime_type'] = "Disturbing The Peace"
report.attributes['crime_code'] = 187
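# A minimal sketch (not part of the original snippet) of reading the report
# back, assuming ebpub's by_attribute() queryset helper that the scrapers
# below also use:
crime_code_field = SchemaField.objects.get(schema=crime_report, name='crime_code')
for ni in NewsItem.objects.filter(schema=crime_report).by_attribute(crime_code_field, 187):
    print "%s: %s" % (ni.title, ni.attributes['crime_type'])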
def update(self, searchTerm, searchOffset):
    youtubeAPI = YouTubeAPI()
    numentries = 50  # How many results do we want the API to return
    logger.info("Starting YouTube_Scraper")
    response = youtubeAPI.runQuery(searchTerm, numentries, searchOffset)
    seencount = addcount = updatecount = 0

    def ns_get(entry, element):
        # The entries carry an unpredictable namespace prefix (ns0: ... ns8:),
        # so probe each prefix until a non-empty value turns up.
        for count in range(9):
            value = entry.get('ns%d:%s' % (count, element))
            if value:
                return value
        return None

    if response:
        for entry in response:
            seencount += 1
            title = ns_get(entry, 'title')
            if not title:
                logger.info("Skipping, as title is empty.")
                continue
            try:
                newsItem = NewsItem.objects.get(title=title, schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                newsItem = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                continue
            try:
                newsItem.schema = self.schema
                description = ns_get(entry, 'description')
                if not description:
                    logger.info("Skipping %r as description is empty." % title)
                    continue
                newsItem.description = description
                newsItem.url = entry['ns0:link']
                newsItem.title = title
                # 'published' is a timestamp like 2011-06-01T12:00:00.000Z;
                # keep only the date part.
                yt_timedate = string.split(ns_get(entry, 'published'), 'T')
                newsItem.pub_date = datetime.datetime.now()
                newsItem.item_date = yt_timedate[0].encode("utf-8")
                _short_title = newsItem.title[:30] + '...'
                #newsItem.location_name = 'Kent'
                # 'pos' is a GeoRSS point: latitude first, then longitude.
                long_lat = string.split(ns_get(entry, 'pos'))
                lat, lon = float(long_lat[0]), float(long_lat[1])
                newsItem.location = Point(lon, lat)
                if not intersects_metro_bbox(newsItem.location):
                    reversed_loc = Point(lat, lon)
                    if intersects_metro_bbox(reversed_loc):
                        logger.info("Got points in apparently reverse order, flipping them")
                        newsItem.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, lat, lon))
                        continue
                if not newsItem.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(newsItem.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        newsItem.location_name = block.pretty_name
                        newsItem.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r" % (newsItem.location.wkt, _short_title))
                        continue
                attributes_ = {'photo_href': entry['ns0:thumb'],
                               'videoID': entry['ns0:video_id'],
                               'searchTerm': searchTerm}
                # Save first so the attributes have a NewsItem row to attach to.
                newsItem.save()
                newsItem.attributes = attributes_
                newsItem.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, newsItem.title))
            except Exception as e:
                logger.exception("unexpected error: %s" % e)
    logger.info("YouTube_Scraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def update(url): logger.info("Scraping police reports") schema_slug = 'police' try: schema = Schema.objects.get(slug=schema_slug) except Schema.DoesNotExist: logger.error( "Schema (%s): DoesNotExist" % schema_slug) sys.exit(1) incident_type_field = SchemaField.objects.get(schema=schema, name='incident_type') f = feedparser.parse(url) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title).strip() # The title will be used as the incident type. if title in SKIP_TYPES: logger.info("Skipping entry of type %s" % title) description = convert_entities(entry.summary) try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.pub_date = datetime(*entry.updated_parsed[:6]) item.location = Point((float(entry.geo_long), float(entry.geo_lat))) item.location_name = get_element(entry, 'address') # parse call time ct = datetime.strptime(get_element(entry, 'calldatetime'), r"%m/%d/%Y %I:%M:%S %p") #ct = datetime(ct.year, ct.month, ct.day, ct.hour, ct.minute, ct.second, tzinfo=tzlocal()) #ct = ct.astimezone(tzutc()) item.item_date = ct item.save() # extra attributes try: item.attributes['calldatetime'] = ct except: pass try: item.attributes['innum'] = int(get_element(entry, 'innum')) except: pass for k in ['disp', 'aptlot', 'address']: try: item.attributes[k] = get_element(entry, k) except: pass # create a lookup based on the title, this is the closest thing to # a category that is available in the data. lu = Lookup.objects.get_or_create_lookup(incident_type_field, title, title, "", False) item.attributes['incident_type'] = lu.id if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc())) logger.info("Finished scraping police reports: %d added, %d updated" % (addcount, updatecount))
def update(self): logger.info("Starting LocalNewsScraper update %s" % self.url) try: schema = Schema.objects.get(slug=self.schema_slug) except Schema.DoesNotExist: logger.error( "Schema (%s): DoesNotExist" % self.schema_slug) return 1 response, content = self.http.request(self.url) if response.fromcache: logger.info("Feed is unchanged since last update (cached)") return f = feedparser.parse(content) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url # Support both georss and xcal for getting the location name. # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/ item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) _short_title = item.title[:30] + '...' # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: # GeoRSS puts latitude (Y) first. y, x = point.split(' ') else: if item.location_name: text = item.location_name else: # Geocode whatever we can find. text = item.title + ' ' + item.description logger.debug("...Falling back on geocoding from %r..." % text[:50]) addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y if not item.location_name: item.location_name = result['address'] item.block = result['block'] break except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error('uncaught geocoder exception on %r\n' % addr) log_exception() if None in (x, y): logger.debug("Skip, couldn't geocode any addresses in item '%s...'" % _short_title) continue item.location = Point((float(x), float(y))) if not intersects_metro_bbox(item.location): reversed_loc = Point((float(y), float(x))) if intersects_metro_bbox(reversed_loc): logger.info( "Got points in apparently reverse order, flipping them") item.location = reversed_loc else: logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x)) continue if not item.location_name: # Fall back to reverse-geocoding. 
from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name item.block = block except reverse.ReverseGeocodeError: logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title)) continue item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, _short_title)) except: logger.error("Warning: couldn't save %r. Traceback:" % _short_title) log_exception() logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
def update(self): """ Download Calendar RSS feed and update database """ logger.info("Starting EventsCalendarScraper") feed = feedparser.parse(self.url) seencount = addcount = updatecount = 0 for entry in feed.entries: def ns_get(element): # work around feedparser unpredictability. namespace, element = element.split(':') result = entry.get('%s_%s' % (namespace, element)) if result is None: result = entry.get(element) return result seencount += 1 title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=self.schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn("Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'), ns_get('xcal:x-calconnect-street')) item.location_name = item.location_name.strip() item.schema = self.schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link start_dt = ns_get('xcal:dtstart') start_dt = dateutil.parser.parse(start_dt) # Upstream bug: They provide a UTC offset of +0000 which # means times in UTC, but they're actually times in # US/Eastern, so do *not* fix the zone. #start_dt = start_dt.astimezone(local_tz) item.item_date = start_dt.date() item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point((float(ns_get('geo:long')), float(ns_get('geo:lat')))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name item.block = block except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() item.attributes['start_time'] = start_dt.time() end_dt = ns_get('xcal:dtend') or u'' if end_dt.strip(): end_dt = dateutil.parser.parse(end_dt.strip()) #end_dt = end_dt.astimezone(local_tz) item.attributes['end_time'] = end_dt.time() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("unexpected error:", sys.exc_info()[1]) log_exception() logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def main(argv=None): logger.info("Starting add_news") if argv: url = argv[0] else: url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai' schema_slug = 'local-news' try: schema = Schema.objects.get(slug=schema_slug) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema_slug) sys.exit(1) f = feedparser.parse(url) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn( "Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url item.location_name = entry.get('x-calconnect-street') or entry.get( 'georss_featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') if point: x, y = point.split(' ') else: # Fall back on geocoding. text = item.title + ' ' + item.description try: x, y = quick_dirty_fallback_geocode(text, parse=True) except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue if None in (x, y): logger.info("couldn't geocode '%s...'" % item.title[:30]) continue item.location = Point((float(y), float(x))) if item.location.x == 0.0 and item.location.y == 0.0: # There's a lot of these. Maybe attempt to # parse and geocode if we haven't already? logger.info("Skipping %r as it has bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name item.block = block except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("Warning: couldn't save %r. Traceback:" % item.title) log_exception() logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
def update(xmlfile, options):
    logger.info("Scraping University of Missouri police reports")
    if options.days == -1:
        start_date = datetime.date(1970, 1, 1)
    else:
        start_date = datetime.date.today() - datetime.timedelta(days=options.days)
    schema_slug = 'mupd'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    # We use iterparse() to avoid keeping the whole xml tree in memory,
    # this is a pretty big file.
    # See http://effbot.org/zone/element-iterparse.htm
    context = iter(lxml.etree.iterparse(xmlfile, events=('start', 'end')))
    addcount = updatecount = 0
    event, root = context.next()
    for event, elem in context:
        if event == 'end' and elem.tag == 'Table':
            category = cleanup(elem.findtext('Description'))
            lat = cleanup(elem.findtext('Lat'))
            lon = cleanup(elem.findtext('Lon'))
            item_date = cleanup(elem.findtext('CreateDatetime'))
            house_number = cleanup(elem.findtext('HouseNumber'))
            prefix = cleanup(elem.findtext('StreetPrefix'))
            street = cleanup(elem.findtext('StreetName'))
            streettype = cleanup(elem.findtext('StreetType'))
            suffix = cleanup(elem.findtext('StreetSuffix'))
            incident_number = cleanup(elem.findtext('IncidentNumber'))
            # We're done with this <Table> tag; clear the root element
            # that iterparse is building to avoid bloating memory with
            # empty elements.
            root.clear()
        else:
            continue
        if item_date:
            item_date = pyrfc3339.parse(item_date)
            if item_date.date() < start_date:
                logger.debug("Date %s is older than start date, skipping." % item_date)
                continue
        else:
            logger.debug("No parsable date, skipping.")
            continue
        location_parts = [house_number, prefix, street, streettype, suffix]
        location_name = ' '.join([s for s in location_parts if s])
        if location_name:
            title = '%s: %s' % (location_name.title(), category.title())
        else:
            title = category.title()
        try:
            lon, lat = float(lon), float(lat)
            location = Point(lon, lat)
        except ValueError:
            location = None
        if location and not intersects_metro_bbox(location):
            logger.info("SKIP %s (at %s), not within our metro area" % (title, (location.x, location.y)))
            continue
        cat_field = SchemaField.objects.get(schema=schema, name='category')
        cat_lookup = Lookup.objects.get_or_create_lookup(cat_field, category, category, "", False)
        attributes = {'incident_number': incident_number,
                      'category': cat_lookup.id}
        incident_number_field = SchemaField.objects.get(schema=schema, name='incident_number')
        try:
            item = NewsItem.objects.filter(schema__id=schema.id).by_attribute(incident_number_field, incident_number)[0]
            status = 'updated'
        except IndexError:
            item = NewsItem(pub_date=datetime.datetime.now())
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched incident_number %s" % incident_number)
            continue
        logger.debug("%s %s" % (status, incident_number))
        try:
            item.title = title
            item.schema = schema
            item.item_date = item_date.date()
            item.description = title  # We don't have anything more verbose!
            item.location = location
            item.location_name = location_name
            item.save()
            item.attributes = attributes
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated" % (addcount, updatecount))
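# cleanup() is not shown in this snippet; a minimal sketch under the
# assumption that it only normalizes findtext() results, which may be None:
def cleanup(text):
    # Treat missing elements as empty strings and strip stray whitespace.
    return (text or u'').strip()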
def update(): """ Download Calendar RSS feed and update database """ logger.info("Starting add_events") url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\ &new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\ &st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1""" schema = 'events' try: schema = Schema.objects.get(slug=schema) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema) sys.exit(1) feed = feedparser.parse(url) addcount = updatecount = 0 for entry in feed.entries: title = convert_entities(entry.title).strip() # Putting 'event' in the title is redundant, ticket #227 if title.lower().startswith('event: '): title = title[7:] try: item = NewsItem.objects.get(title=title, schema__id=schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn("Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = entry.get('xcal_x-calconnect-street') or entry.get('x-calconnect-street') or u'' item.schema = schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point((float(entry['geo_long']), float(entry['geo_lat']))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.exception("unexpected error:", sys.exc_info()[1]) logger.info("add_events finished: %d added, %d updated" % (addcount, updatecount))
def update(self): """ Download Calendar RSS feed and update database """ logger.info("Starting EventsCalendarScraper") feed = feedparser.parse(self.url) seencount = addcount = updatecount = 0 for entry in feed.entries: def ns_get(element): # work around feedparser unpredictability. namespace, element = element.split(':') result = entry.get('%s_%s' % (namespace, element)) if result is None: result = entry.get(element) return result seencount += 1 title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=self.schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn( "Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = '%s %s' % ( ns_get('xcal:x-calconnect-venue-name'), ns_get('xcal:x-calconnect-street')) item.location_name = item.location_name.strip() item.schema = self.schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link start_dt = ns_get('xcal:dtstart') start_dt = dateutil.parser.parse(start_dt) # Upstream bug: They provide a UTC offset of +0000 which # means times in UTC, but they're actually times in # US/Eastern, so do *not* fix the zone. #start_dt = start_dt.astimezone(local_tz) item.item_date = start_dt.date() item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point( (float(ns_get('geo:long')), float(ns_get('geo:lat')))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode( item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() item.attributes['start_time'] = start_dt.time() end_dt = ns_get('xcal:dtend') or u'' if end_dt.strip(): end_dt = dateutil.parser.parse(end_dt.strip()) #end_dt = end_dt.astimezone(local_tz) item.attributes['end_time'] = end_dt.time() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except Exception as e: logger.exception("unexpected error: %s" % e) logger.info( "EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
params = {'deals_filter': False,
          'bounds': " 41.0834917675, -81.39382852783203|41.206297513, -81.30878448486328",
          'limit': '20'}
response = yelpAxes.search(params)
for yelpPost in response['businesses']:
    newsItem = NewsItem()
    newsItem.schema = schema
    newsItem.description = yelpPost['snippet_text']
    newsItem.rating = yelpPost['rating']
    newsItem.url = yelpPost['url']
    newsItem.title = yelpPost['name']
    newsItem.item_date = datetime.now()
    newsItem.pub_date = datetime.now()
    newsItem.location_name = 'Kent'
    newsItem.location = Point((float(yelpPost['location']['coordinate']['longitude']),
                               float(yelpPost['location']['coordinate']['latitude'])))
    newsItem.save()

class YelpScraper(object):

    def __init__(self, schema_slug='reviews'):
        try:
            self.schema = Schema.objects.get(slug=schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % schema_slug)
            sys.exit(1)
def main(argv=None):
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)
    f = feedparser.parse(url)
    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
            print "Already have %r (id %d)" % (item.title, item.id)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        try:
            item.schema = schema
            item.title = convert_entities(e.title)
            item.description = convert_entities(e.description)
            item.url = e.link
            item.location_name = e.get('x-calconnect-street') or e.get('georss_featurename')
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
            if 'point' in e:
                x, y = e.point.split(' ')
            elif 'georss_point' in e:
                x, y = e.georss_point.split(' ')
            else:
                text = item.title + ' ' + item.description
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(text, parse=True)
                if None in (x, y):
                    print " couldn't geocode '%s...'" % item.title[:30]
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                print "Skipping %r as it has bad location 0,0" % item.title
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    print " Reverse-geocoded point to %r" % block.pretty_name
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    print " Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)
                    item.location_name = u''
            item.save()
            print "Added: %s" % item.title
        except:
            print "Warning: couldn't save %r. Traceback:" % item.title
            import cStringIO, traceback
            # Use a separate name for the buffer so we don't shadow the
            # feedparser result `f` while still iterating over it.
            buf = cStringIO.StringIO()
            traceback.print_exc(file=buf)
            msg = buf.getvalue()
            print msg
def update(self):
    #
    # Download Calendar RSS feed and update database
    #
    logger.info("Starting KSUStudentProgrammingScraper")
    feed = feedparser.parse(self.url)
    seencount = addcount = updatecount = 0
    for entry in feed.entries:
        seencount += 1
        title = convert_entities(entry.title)
        title = foo(title, '', ' (')
        try:
            item = NewsItem.objects.get(title=title, schema__id=self.schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
            continue
        try:
            #
            # The actual rss feed elements are grabbed here
            #
            itm_description = entry.description
            soup = BeautifulSoup(foo(itm_description, "</table><br />", "<br /><br />"))
            locations = soup.findAll(text=True)
            location = locations[0].strip()
            place_grabber = places.place_grabber()
            grab_results = place_grabber(location)
            try:
                item.location = Place.objects.get(pretty_name=grab_results[0][2]).location
                item.location_name = Place.objects.get(pretty_name=grab_results[0][2]).pretty_name
            except:
                item.location = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place.location
                item.location_name = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place.pretty_name
            try:
                item.attributes['room'] = locations[1].strip().replace("Room: ", "")
            except Exception as e:
                logger.info("Tried saving item.room, error: %s" % e)
            item.schema = self.schema
            item.title = title
            soup = BeautifulSoup(foo(itm_description, "<br /><br />", "</td></tr>"))
            item.description = soup.findAll(text=True)
            item.description = item.description[0].strip()
            item.url = entry.link
            start_t = foo(itm_description, "Start Time:</b> </td><td>", "</td>")
            start_t = dateutil.parser.parse(start_t)
            end_t = foo(itm_description, "End Time:</b> </td><td>", "</td>")
            end_t = dateutil.parser.parse(end_t)
            end_dt = foo(itm_description, "End Date:</b> </td><td>", "</td>")
            end_dt = dateutil.parser.parse(end_dt)
            item.item_date = dateutil.parser.parse(entry.category)
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.attributes['start-time'] = start_t.time()
            item.attributes['end-time'] = end_t.time()
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception as e:
            logger.exception("unexpected error: %s" % e)
    logger.info("KSUStudentProgrammingScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
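# foo() above is clearly an anonymized "grab the text between two delimiters"
# helper. A hypothetical reconstruction matching how it is called (an empty
# start delimiter means "from the beginning of the string"):
def foo(text, start, end):
    # Slice out the substring after `start` and before the next `end`.
    i = text.find(start)
    if i == -1:
        return text
    i += len(start)
    j = text.find(end, i)
    if j == -1:
        j = len(text)
    return text[i:j]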
def _update_service_request(self, sreq):
    service_request_id = self._get_request_field(sreq, 'service_request_id')
    if not service_request_id:
        log.info("Skipping request with no request id (may be in progress)!")
        return
    # Pull out the location first; if we can't do this, we don't want it.
    try:
        point = Point(float(sreq.find('long').text),
                      float(sreq.find('lat').text),
                      srid=4326)
    except:
        log.debug("Skipping request with invalid location (%s)" % service_request_id)
        return
    if self.bounds is not None:
        if not self.bounds.intersects(point):
            log.debug("Skipping request at %s, outside bounds" % point)
            return
    try:
        ni = NewsItem.objects.filter(schema=self.schema).by_attribute(self.service_request_id_field, service_request_id).all()[0]
        log.info('updating existing request %s' % service_request_id)
    except IndexError:
        # create the NewsItem
        ni = NewsItem(schema=self.schema)
        log.info('created new service request %s' % service_request_id)
    ni.title = self._get_request_field(sreq, 'service_name')
    ni.description = self._get_request_field(sreq, 'description')
    ni.location = point
    ni.location_name = self._get_request_field(sreq, 'address')
    # try to reverse geocode this point
    if not ni.location_name:
        try:
            block, distance = reverse_geocode(ni.location)
            ni.location_name = block.pretty_name
        except:
            log.debug("Failed to reverse geocode item %s" % service_request_id)
    # try to pull the requested_datetime into pubdate/itemdate
    # default to now.
    try:
        ni.pub_date = pyrfc3339.parse(sreq.find('requested_datetime').text)
    except:
        ni.pub_date = datetime.datetime.utcnow()
        log.info("Filling in current time for pub_date on item with no requested_datetime (%s)" % service_request_id)
    ni.item_date = datetime.date(ni.pub_date.year, ni.pub_date.month, ni.pub_date.day)
    if self.html_url_template:
        ni.url = self.html_url_template.replace('{id}', service_request_id)
        log.info('Assigning html url "%s" to %s' % (ni.url, service_request_id))
    ni.save()
    ni.attributes['service_request_id'] = service_request_id
    # varchar fields
    for fieldname in ('request_id', 'service_code', 'address_id', 'media_url',
                      'status_notes', 'service_notice'):
        val = self._get_request_field(sreq, fieldname)
        if val != '':
            if len(val) < 4096:
                ni.attributes[fieldname] = val
            else:
                log.info("truncating value for %s (%s)" % (fieldname, val))
                ni.attributes[fieldname] = val[0:4096]
    # text fields
    for fieldname in ('service_notice',):
        val = self._get_request_field(sreq, fieldname)
        if val != '':
            ni.attributes[fieldname] = val
    # datetime fields
    for fieldname in ('expected_datetime', 'requested_datetime'):
        val = self._get_request_field(sreq, fieldname)
        if val == '':
            continue
        # try to parse it
        try:
            ni.attributes[fieldname] = pyrfc3339.parse(val)
        except ValueError:
            # invalid date, just omit
            log.info('Omitting invalid datetime field %s = %s' % (fieldname, val))
    # lookups
    for fieldname in ('service_name', 'agency_responsible', 'status'):
        val = self._get_request_field(sreq, fieldname)
        if val == '':
            ni.attributes[fieldname] = self._lookup_for(fieldname, 'Unknown')
        else:
            ni.attributes[fieldname] = self._lookup_for(fieldname, val)
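# _lookup_for() is referenced above but not shown. A plausible sketch of the
# method (an assumption about the scraper class), expressed with the
# Lookup.objects.get_or_create_lookup() call the other scrapers in this
# collection use for the same purpose:
def _lookup_for(self, fieldname, value):
    field = SchemaField.objects.get(schema=self.schema, name=fieldname)
    lookup = Lookup.objects.get_or_create_lookup(field, value, value, "", False)
    return lookup.id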
def update(self):
    #
    # Grab the Twitter feeds and start saving
    #
    logger.info("Starting Twitter Scraper")
    response = self.search_twitter(self.hashtag)
    seencount = addcount = updatecount = 0
    for entry in response['results']:
        seencount += 1
        title = entry['text'].replace('RT ', '')
        try:
            item = NewsItem.objects.get(title=title, schema__id=self.schema.id)
            status = "updated"
            # We already have this tweet; nothing to refresh, so move on.
            continue
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
            continue
        try:
            #
            # The actual Twitter return results are grabbed here
            #
            if any(entry['from_user'] in s for s in self.allowed_users):
                #item.location_name = entry['location']
                # print entry['entities']['hashtags']
                # print entry['location']
                item.schema = self.schema
                item.title = title
                item.description = entry['text'].replace('RT ', '')
                item.location_name = 'student center'
                try:
                    item.url = ("https://twitter.com/#!/%s/status/%s"
                                % (entry['from_user'], entry['id_str']))
                except:
                    print "No url"
                rg = re.compile(self.re1, re.IGNORECASE | re.DOTALL)
                m = rg.search(item.description)
                if m:
                    mmddyy1 = m.group(1)
                    print "(" + mmddyy1 + ")" + "\n"
                else:
                    mmddyy1 = entry['created_at']
                item.item_date = dateutil.parser.parse(mmddyy1)
                item.pub_date = datetime.datetime.now()
                item.save()
                item.attributes = {'photo_href': entry['profile_image_url'],
                                   'author': entry['entities']['user_mentions'][0]['screen_name']}
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
        except Exception as e:
            logger.exception("unexpected error: %s" % e)
    logger.info("TwitterScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def update(): """ Download Calendar RSS feed and update database """ logger.info("Starting add_events") url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\ &new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\ &st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1""" schema = 'events' try: schema = Schema.objects.get(slug=schema) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema) sys.exit(1) feed = feedparser.parse(url) addcount = updatecount = 0 for entry in feed.entries: title = convert_entities(entry.title).strip() # Putting 'event' in the title is redundant, ticket #227 if title.lower().startswith('event: '): title = title[7:] try: item = NewsItem.objects.get(title=title, schema__id=schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn( "Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = entry.get( 'xcal_x-calconnect-street') or entry.get( 'x-calconnect-street') or u'' item.schema = schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point( (float(entry['geo_long']), float(entry['geo_lat']))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.exception("unexpected error:", sys.exc_info()[1]) logger.info("add_events finished: %d added, %d updated" % (addcount, updatecount))
def update(url):
    schema_slug = 'sheriff'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    incident_type_field = SchemaField.objects.get(schema=schema, name='incident_type')
    try:
        innum_field = SchemaField.objects.get(schema=schema, name='innum')
    except SchemaField.DoesNotExist:
        logger.error("SchemaField innum Does Not Exist for %s" % schema_slug)
        sys.exit(1)
    logger.info("Scraping %s" % schema.name)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        innum = int(get_element(entry, 'innum'))
        title = convert_entities(entry.title)
        description = convert_entities(entry.summary)
        try:
            item = NewsItem.objects.filter(schema=schema).by_attribute(innum_field, innum)[0]
            #url=item_url)
            status = 'updated'
        except IndexError:
            item = NewsItem()
            status = 'added'
        try:
            item.title = title
            item.schema = schema
            item.description = description
            try:
                item.location = Point((float(entry.geo_long), float(entry.geo_lat)))
            except:
                logger.info("Skipping item %s with no location information" % innum)
                continue
            item.location_name = get_element(entry, 'address')
            # this feed uses an invalidly formatted pubDate which
            # appears to be intended to express the time of the
            # incident, used for publication time as well.
            # 24 hour time.
            ct = datetime.strptime(entry.updated, r"%m/%d/%Y %H:%M:%S")
            #ct = datetime(ct.year, ct.month, ct.day, ct.hour, ct.minute, ct.second, tzinfo=tzlocal())
            #ct = ct.astimezone(tzutc())
            item.item_date = ct.date()
            item.pub_date = ct
            item.save()
            # extra attributes
            item.attributes['innum'] = innum
            for k in ['address']:
                try:
                    item.attributes[k] = get_element(entry, k)
                except:
                    pass
            # create a lookup based on the title, this is the closest thing to
            # a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(incident_type_field, title, title, "", False)
            item.attributes['incident_type'] = lu.id
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))
    logger.info("Finished scraping %s: %d added, %d updated" % (schema.name, addcount, updatecount))
def update(argv=None): logger.info("Starting add_news") if argv: url = argv[0] else: url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai' schema_slug = 'local-news' try: schema = Schema.objects.get(slug=schema_slug) except Schema.DoesNotExist: logger.error( "Schema (%s): DoesNotExist" % schema_slug) sys.exit(1) f = feedparser.parse(url) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: x, y = point.split(' ') if True: # Fall back on geocoding. text = item.title + ' ' + item.description addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y break except (GeocodingException, ParsingError): logger.debug("Geocoding exception on %r:" % text, exc_info=True) continue except: logger.exception('uncaught geocoder exception on %r\n' % addr) if None in (x, y): logger.info("couldn't geocode '%s...'" % item.title[:30]) continue item.location = Point((float(y), float(x))) if item.location.x == 0.0 and item.location.y == 0.0: # There's a lot of these. Maybe attempt to # parse and geocode if we haven't already? logger.info("Skipping %r as it has bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.exception("Warning: couldn't save %r. Traceback:" % item.title) logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
def update(self): logger.info("Starting LocalNewsScraper update %s" % self.url) try: schema = Schema.objects.get(slug=self.schema_slug) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % self.schema_slug) return 1 response, content = self.http.request(self.url) if response.fromcache: logger.info("Feed is unchanged since last update (cached)") return f = feedparser.parse(content) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn( "Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url # Support both georss and xcal for getting the location name. # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/ item.location_name = entry.get( 'xCal_x-calconnect-street') or entry.get( 'x-calconnect-street') or entry.get( 'georss_featurename') or entry.get('featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) _short_title = item.title[:30] + '...' # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: # GeoRSS puts latitude (Y) first. y, x = point.split(' ') else: if item.location_name: text = item.location_name else: # Geocode whatever we can find. text = item.title + ' ' + item.description logger.debug("...Falling back on geocoding from %r..." % text[:50]) addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y if not item.location_name: item.location_name = result['address'] break except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error( 'uncaught geocoder exception on %r\n' % addr) log_exception() if None in (x, y): logger.debug( "Skip, couldn't geocode any addresses in item '%s...'" % _short_title) continue item.location = Point((float(x), float(y))) if not intersects_metro_bbox(item.location): reversed_loc = Point((float(y), float(x))) if intersects_metro_bbox(reversed_loc): logger.info( "Got points in apparently reverse order, flipping them" ) item.location = reversed_loc else: logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x)) continue if not item.location_name: # Fall back to reverse-geocoding. 
from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode( item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.info( " Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title)) continue item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, _short_title)) except: logger.error("Warning: couldn't save %r. Traceback:" % _short_title) log_exception() logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))