def clean_list_record(self, record):
    """Validate and enrich one parsed feed record in place.

    Derives a point and a human-readable location name (raising
    SkipRecord when neither can be determined or the point is outside
    the metro bounding box), stores them under record['location'] and
    record['location_name'], and returns the record.
    """
    # NOTE(review): these two lines use attribute assignment while the
    # rest of the function uses dict-style access; confirm `record`
    # supports both (a plain dict would raise AttributeError here).
    record.title = convert_entities(record['title'])
    record.description = convert_entities(record['description'])
    # Don't know why, but some feeds have 'id' *instead* of 'link'.
    if record.get('id', '').startswith('http'):
        record['link'] = record['id']
    # This tries GeoRSS, RDF Geo, xCal, ...
    point, location_name = self.get_point_and_location_name(record)
    # Truncated title, used only in log / error messages below.
    _short_title = record['title'][:30] + '...'
    if not point:
        raise SkipRecord("couldn't geocode any addresses in item '%s...'"
                         % _short_title)
    if not location_name:
        raise SkipRecord(
            "Skip, no location name and failed to reverse geocode %s for %r"
            % (point.wkt, _short_title))
    if not intersects_metro_bbox(point):
        # Check if latitude, longitude seem to be reversed; I've
        # seen that in some bad feeds!
        reversed_loc = Point(point.y, point.x)
        if intersects_metro_bbox(reversed_loc):
            self.logger.info(
                "Got points in apparently reverse order, flipping them")
            point = reversed_loc
        else:
            raise SkipRecord("Skipping %r as %s,%s is out of bounds"
                             % (_short_title, point.y, point.x))
    record['location_name'] = location_name
    record['location'] = point
    return record
def main():
    """ Download Calendar RSS feed and update database """
    # Query string is hard-coded to today's events within 20 miles.
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'
    parser = OptionParser()
    parser.add_option('-q', '--quiet', action="store_true", dest="quiet",
                      default=False, help="no output")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        return parser.error('script does not take any arguments')
    try:
        # `schema` is rebound from slug string to the Schema model instance.
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)
    feed = feedparser.parse(url)
    for entry in feed.entries:
        # Identity is (title, description): an exact match means update,
        # otherwise a fresh NewsItem is created.
        try:
            item = NewsItem.objects.get(title=entry.title,
                                        description=entry.description)
            status = "Updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "Added"
        try:
            item.location_name = entry.get('xcal_x-calconnect-street')
            item.schema = schema
            item.title = convert_entities(entry.title)
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # Point takes (x, y) == (longitude, latitude).
            item.location = Point((float(entry['geo_long']),
                                   float(entry['geo_lat'])))
            # (0, 0) is the feed's "no location" sentinel; skip those.
            if (item.location.x, item.location.y) == (0.0, 0.0):
                print "Skipping %r, bad location 0,0" % item.title
                continue
            item.save()
            if not options.quiet:
                print "%s: %s" % (status, item.title)
        except ValueError:
            # NOTE(review): only ValueError is caught (e.g. bad floats /
            # date tuples); any other failure will abort the whole run.
            if not options.quiet:
                print "unexpected error:", sys.exc_info()[1]
def text_from_html(html):
    """Strip every tag from ``html`` and return the remaining plain text,
    with HTML entities converted to characters."""
    stripped = preprocess_to_string(
        html,
        drop_tags=_html_droptags,
        drop_trees=_html_droptrees)
    return convert_entities(stripped)
def clean_detail_record(self, record):
    """Normalize one scraped restaurant-inspection detail record.

    Mutates and returns ``record``: normalizes 'violation_points',
    derives the boolean 'followup_inspection' (points > 27), replaces
    the raw 'violations' HTML with a 'violation_list' of plain strings,
    and strips the ZIP code / extra whitespace from 'address'.
    Returns None when the detail page failed to parse (empty record).
    Raises ScraperBroken on unrecognizable data.
    """
    if record == {}:
        # Parsing the detail page failed.
        return None
    if record['violation_points'].startswith('Not Available'):
        record['violation_points'] = 'N/A'
        record['followup_inspection'] = False
    else:
        if not record['violation_points'].isdigit():
            raise ScraperBroken('Got odd violation points value %r'
                                % record['violation_points'])
        # More than 27 points triggers a follow-up inspection.
        record['followup_inspection'] = int(
            record['violation_points']) > 27
    # Parse the violations from the HTML chunk. When we're done,
    # record['violation_list'] will be a (possibly empty) list of strings.
    vio_chunk = record.pop('violations')
    if vio_chunk == '':
        record['violation_list'] = []
    else:
        vios = violation_list_re.findall(vio_chunk)
        if not vios:
            # BUG FIX: the message arg was passed as a second positional
            # argument instead of %-interpolated, so the restaurant name
            # never appeared in the exception text.
            raise ScraperBroken(
                "Violation data not found for restaurant %s"
                % record['restaurant_name'])
        record['violation_list'] = [
            strip_tags(convert_entities(v.strip())) for v in vios
        ]
    # Remove the ZIP code from the address, as it complicates geocoding.
    record['address'] = re.sub(r'\s*\d{5}\s*$', '', record['address'])
    # Strip extra internal whitespace.
    record['address'] = re.sub(r'\s+', ' ', record['address'])
    return record
def clean_detail_record(self, record):
    """Clean one restaurant-inspection detail record in place.

    - 'violation_points': 'Not Available...' becomes 'N/A'; otherwise it
      must be all digits (ScraperBroken if not).
    - 'followup_inspection': True when points > 27.
    - 'violations' (raw HTML) is popped and replaced by 'violation_list',
      a possibly-empty list of entity-decoded, tag-stripped strings.
    - 'address' loses its trailing ZIP and redundant whitespace.
    Returns the record, or None when the detail page failed to parse.
    """
    if record == {}:
        # Parsing the detail page failed.
        return None
    if record['violation_points'].startswith('Not Available'):
        record['violation_points'] = 'N/A'
        record['followup_inspection'] = False
    else:
        if not record['violation_points'].isdigit():
            raise ScraperBroken('Got odd violation points value %r'
                                % record['violation_points'])
        record['followup_inspection'] = int(record['violation_points']) > 27
    # Parse the violations from the HTML chunk. When we're done,
    # record['violation_list'] will be a (possibly empty) list of strings.
    vio_chunk = record.pop('violations')
    if vio_chunk == '':
        record['violation_list'] = []
    else:
        vios = violation_list_re.findall(vio_chunk)
        if not vios:
            # BUG FIX: used to pass the name as a stray second positional
            # arg (exception args tuple) instead of interpolating it.
            raise ScraperBroken("Violation data not found for restaurant %s"
                                % record['restaurant_name'])
        record['violation_list'] = [strip_tags(convert_entities(v.strip()))
                                    for v in vios]
    # Remove the ZIP code from the address, as it complicates geocoding.
    record['address'] = re.sub(r'\s*\d{5}\s*$', '', record['address'])
    # Strip extra internal whitespace.
    record['address'] = re.sub(r'\s+', ' ', record['address'])
    return record
def clean_list_record(self, record):
    """Normalize one raw flickr-API photo record into a dict of
    NewsItem-ready fields.

    Returns a dict with keys: item_date, pub_date, title, description,
    location, location_name, url, and _attributes.
    Raises SkipRecord when the photo's point cannot be reverse-geocoded.
    """
    # clean up a record dict
    # Item date, in timezone of the photo owner.
    # Not sure how to determine what that is, so we'll leave it.
    cleaned = {}
    cleaned['item_date'] = datetime.datetime.strptime(record['datetaken'],
                                                      '%Y-%m-%d %H:%M:%S')
    # Keep only the date; time-of-day is discarded.
    cleaned['item_date'] = cleaned['item_date'].date()
    # Posted date, UTC timestamp.
    pub_date = datetime.datetime.fromtimestamp(
        float(record['dateupload']), utc)
    cleaned['pub_date'] = pub_date.astimezone(local_tz)
    # The API nests the description text under '_content'.
    description = record['description']['_content']
    cleaned['description'] = convert_entities(description.strip())
    cleaned['title'] = convert_entities(record['title'])
    # Point takes (x, y) == (longitude, latitude).
    x, y = record['longitude'], record['latitude']
    cleaned['location'] = Point((float(x), float(y)))
    # Possibly we could figure out flickr's geo API and resolve
    # the photo's place_id and/or woeid to the place name? But
    # those are probably not specific enough; reverse-geocode
    # instead.
    try:
        block, distance = reverse_geocode(cleaned['location'])
        cleaned['location_name'] = block.pretty_name
    except ReverseGeocodeError:
        raise SkipRecord("Could not geocode location %s, %s" % (x, y))
    # Don't think any of the urls returned by the API's "extras"
    # correspond to the page? not sure.
    cleaned['url'] = 'http://www.flickr.com/photos/%(owner)s/%(id)s' % record
    attributes = {}
    attributes['sourcename'] = 'Flickr'
    #attributes['photo_id'] = record['id']
    attributes['user_id'] = record['owner']
    attributes['username'] = record['ownername']
    # NOTE(review): this variant stores 'url_m' (medium-size photo); the
    # sibling variant stores 'url_sq' (75x75 small square). Confirm which
    # size is intended for photo_href.
    attributes['photo_href'] = record['url_m']
    cleaned['_attributes'] = attributes
    return cleaned
def clean_list_record(self, record):
    """Normalize one raw flickr-API photo record into a dict of
    NewsItem-ready fields (item_date, pub_date, title, description,
    location, location_name, url, _attributes).

    Raises SkipRecord when the photo's point cannot be reverse-geocoded.
    """
    # clean up a record dict
    # Item date, in timezone of the photo owner.
    # Not sure how to determine what that is, so we'll leave it.
    cleaned = {}
    cleaned['item_date'] = datetime.datetime.strptime(
        record['datetaken'], '%Y-%m-%d %H:%M:%S')
    # Keep only the date; time-of-day is discarded.
    cleaned['item_date'] = cleaned['item_date'].date()
    # Posted date, UTC timestamp.
    pub_date = datetime.datetime.fromtimestamp(float(record['dateupload']),
                                               utc)
    cleaned['pub_date'] = pub_date.astimezone(local_tz)
    # The API nests the description text under '_content'.
    description = record['description']['_content']
    cleaned['description'] = convert_entities(description.strip())
    cleaned['title'] = convert_entities(record['title'])
    # Point takes (x, y) == (longitude, latitude).
    x, y = record['longitude'], record['latitude']
    cleaned['location'] = Point((float(x), float(y)))
    # Possibly we could figure out flickr's geo API and resolve
    # the photo's place_id and/or woeid to the place name? But
    # those are probably not specific enough; reverse-geocode
    # instead.
    try:
        block, distance = reverse_geocode(cleaned['location'])
        cleaned['location_name'] = block.pretty_name
    except ReverseGeocodeError:
        raise SkipRecord("Could not geocode location %s, %s" % (x, y))
    # Don't think any of the urls returned by the API's "extras"
    # correspond to the page? not sure.
    cleaned[
        'url'] = 'http://www.flickr.com/photos/%(owner)s/%(id)s' % record
    attributes = {}
    attributes['sourcename'] = 'Flickr'
    #attributes['photo_id'] = record['id']
    attributes['user_id'] = record['owner']
    attributes['username'] = record['ownername']
    # Thumbnail. 'Small square' photos are 75x75.
    attributes['photo_href'] = record['url_sq']
    cleaned['_attributes'] = attributes
    return cleaned
def html_to_paragraph_list(tree):
    """
    Given an HTML tree, removes HTML tags and returns a list of strings,
    with each string representing a paragraph/block.

    NOTE(review): when the tree has no <body> (IndexError), this returns
    '' — an empty string, not a list — so callers must handle both shapes.
    """
    # Tags whose boundaries mark paragraph breaks.
    block_tags = set([
        'blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'h7', 'h8', 'li', 'p', 'td', 'th', 'tr'
    ])
    # Inline-ish tags: unwrap the tag but keep its text content.
    drop_tags_only = set([
        'a', 'abbr', 'acronym', 'b', 'center', 'dir', 'dl', 'em', 'font',
        'form', 'hr', 'i', 'label', 'menu', 'ol', 'pre', 'small', 'span',
        'strong', 'sub', 'sup', 'table', 'tbody', 'tfoot', 'thead', 'topic',
        'u', 'ul', 'wbr'
    ])
    # Tags removed together with all of their contents.
    drop_tags_and_contents = set([
        'applet', 'area', 'button', 'embed', 'img', 'iframe', 'head', 'input',
        'link', 'map', 'meta', 'noscript', 'object', 'option', 'script',
        'select', 'spacer', 'style', 'textarea', 'title'
    ])
    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring):
            # If it's a comment...
            element.drop_tag()
            continue
        # Pre-existing newlines are layout artifacts; flatten them so the
        # final split only sees newlines inserted deliberately below.
        if element.text and '\n' in element.text:
            element.text = element.text.replace('\n', ' ')
        if element.tail and '\n' in element.tail:
            element.tail = element.tail.replace('\n', ' ')
        if element.tag in block_tags:
            # Mark the paragraph boundary with newlines, then unwrap.
            element.text = '\n' + (element.text or '')
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag == 'br':
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            # Can't drop subtrees while iterating; collect, drop after.
            elements_to_drop.append(element)
        elif element.tag not in ('html', 'body'):
            # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()
    try:
        tree.body
    except IndexError:
        return ''
    else:
        # Serialize, strip the surrounding <body>...</body> wrapper, then
        # split on runs of newlines to obtain paragraph strings.
        new_html = etree.tostring(
            tree.body, method='html')[6:-7]  # strip <body> and </body>
        new_html = convert_entities(new_html)
        return re.split(r'\s*\n+\s*', new_html.strip())
def text_from_html(html):
    """Remove ALL tags and return all plain text.

    When tag-stripping yields nothing, falls back to decoding the raw
    input with UnicodeDammit (the input may not really have been HTML).
    """
    text = preprocess_to_string(html,
                                drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    if not text:
        # Maybe there was something there but not really HTML.
        # BUG FIX: this condition tested `text`, which is always falsy
        # inside this branch, so the UnicodeDammit fallback was dead
        # code; test the original `html` input instead.
        if html and not isinstance(html, unicode):
            text = UnicodeDammit(html, isHTML=False).unicode.strip()
        else:
            text = u''
    text = convert_entities(text)
    return text
def geocode_if_needed(self, point, location_name, address_text='', **kwargs):
    """
    If either ``point`` or ``location_name`` is not set, try to geocode /
    reverse-geocode as needed to derive one from the other.
    Returns (point, location_name).

    If neither one is set, try to parse addresses out of
    ``address_text`` and derive both.

    Either value may be None if it can't be determined.

    Any other keyword args are passed to ``full_geocode()``.
    """
    if not point:
        text = convert_entities(location_name or address_text)
        self.logger.debug("...Falling back on geocoding from '%s...'"
                          % text[:50])
        addrs = parse_addresses(text)
        for addr, unused in addrs:
            try:
                result = self.geocode(addr, **kwargs)
                if result is not None:
                    point = result['point']
                    self.logger.debug("internally geocoded %r" % addr)
                    # TODO: what if it's a Place?
                    if not location_name:
                        location_name = result['address']
                    break
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit / KeyboardInterrupt; best-effort geocoding only
            # needs to absorb ordinary errors.
            except Exception:
                self.logger.exception(
                    'uncaught geocoder exception on %r\n' % addr)
                continue
    if point and not location_name:
        # Fall back to reverse-geocoding.
        from ebpub.geocoder import reverse
        try:
            block, distance = reverse.reverse_geocode(point)
            self.logger.debug(" Reverse-geocoded point to %r"
                              % block.pretty_name)
            location_name = block.pretty_name
        except reverse.ReverseGeocodeError:
            location_name = None
    return (point, location_name)
def update(self):
    """ Download the obituary RSS feed and update the database.

    Counts how many entries resulted in newly-created items and logs a
    summary at the end. Aborts the loop on the first unexpected error.
    """
    logger.info("Starting ObituaryScraper")
    feed = feedparser.parse(self.url)
    total_created = 0
    for entry in feed.entries:
        title = convert_entities(entry.title)
        try:
            created = self.parse_entry(entry, title)
            if created:
                total_created += 1
        # BUG FIX: was a bare `except:` (also caught SystemExit /
        # KeyboardInterrupt).
        except Exception:
            # BUG FIX: the exception was passed as a stray positional
            # arg with no %s placeholder, so logging raised internally
            # and the message was lost.
            logger.error("unexpected error: %s", sys.exc_info()[1])
            log_exception()
            break
    # BUG FIX: the summary logged the loop-local `created` (undefined on
    # an empty feed, and just the last entry's flag otherwise) instead of
    # the accumulated total.
    logger.info("Created %d of %d total" % (total_created,
                                            len(feed.entries)))
def geocode_if_needed(self, point, location_name, address_text='', **kwargs):
    """
    If either ``point`` or ``location_name`` is not set, try to geocode /
    reverse-geocode as needed to derive one from the other.
    Returns (point, location_name).

    If neither one is set, try to parse addresses out of
    ``address_text`` and derive both.

    Either value may be None if it can't be determined.

    Any other keyword args are passed to ``full_geocode()``.
    """
    if not point:
        text = convert_entities(location_name or address_text)
        self.logger.debug("...Falling back on geocoding from '%s...'"
                          % text[:50])
        addrs = parse_addresses(text)
        for addr, unused in addrs:
            try:
                result = self.geocode(addr, **kwargs)
                if result is not None:
                    point = result['point']
                    self.logger.debug("internally geocoded %r" % addr)
                    # TODO: what if it's a Place?
                    if not location_name:
                        location_name = result['address']
                    break
            # BUG FIX: was a bare `except:`; narrow to Exception so
            # SystemExit / KeyboardInterrupt still propagate.
            except Exception:
                self.logger.exception('uncaught geocoder exception on %r\n'
                                      % addr)
                continue
    if point and not location_name:
        # Fall back to reverse-geocoding.
        from ebpub.geocoder import reverse
        try:
            block, distance = reverse.reverse_geocode(point)
            self.logger.debug(" Reverse-geocoded point to %r"
                              % block.pretty_name)
            location_name = block.pretty_name
        except reverse.ReverseGeocodeError:
            location_name = None
    return (point, location_name)
def html_to_paragraph_list(tree):
    """
    Given an HTML tree, removes HTML tags and returns a list of strings,
    with each string representing a paragraph/block.

    NOTE(review): if the tree has no <body> (IndexError), the return value
    is '' — an empty string rather than a list.
    """
    # Tags whose boundaries become paragraph breaks.
    block_tags = set(['blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3',
                      'h4', 'h5', 'h6', 'h7', 'h8', 'li', 'p', 'td', 'th',
                      'tr'])
    # Inline-ish tags: unwrap the tag but keep its text content.
    drop_tags_only = set(['a', 'abbr', 'acronym', 'b', 'center', 'dir',
                          'dl', 'em', 'font', 'form', 'hr', 'i', 'label',
                          'menu', 'ol', 'pre', 'small', 'span', 'strong',
                          'sub', 'sup', 'table', 'tbody', 'tfoot', 'thead',
                          'topic', 'u', 'ul', 'wbr'])
    # Tags removed together with everything inside them.
    drop_tags_and_contents = set(['applet', 'area', 'button', 'embed',
                                  'img', 'iframe', 'head', 'input', 'link',
                                  'map', 'meta', 'noscript', 'object',
                                  'option', 'script', 'select', 'spacer',
                                  'style', 'textarea', 'title'])
    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring):
            # If it's a comment...
            element.drop_tag()
            continue
        # Flatten pre-existing newlines so the final split only sees the
        # newlines this function inserts deliberately.
        if element.text and '\n' in element.text:
            element.text = element.text.replace('\n', ' ')
        if element.tail and '\n' in element.tail:
            element.tail = element.tail.replace('\n', ' ')
        if element.tag in block_tags:
            # Mark the paragraph boundary with newlines, then unwrap.
            element.text = '\n' + (element.text or '')
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag == 'br':
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            # Subtrees can't be dropped mid-iteration; collect for later.
            elements_to_drop.append(element)
        elif element.tag not in ('html', 'body'):
            # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()
    try:
        tree.body
    except IndexError:
        return ''
    else:
        # Serialize, strip the <body> wrapper, convert entities, then
        # split on newline runs to obtain paragraph strings.
        new_html = etree.tostring(tree.body,
                                  method='html')[6:-7]  # strip <body> and </body>
        new_html = convert_entities(new_html)
        return re.split(r'\s*\n+\s*', new_html.strip())
def parse_entry(self, entry, title):
    """Create or update a NewsItem for one obituary feed entry.

    Returns True if a new item was created, False if an existing item
    was updated, or None when the entry's description could not be
    parsed into "<location> -- <body>".
    """
    try:
        item = NewsItem.objects.get(title=title, schema__id=self.schema.id)
    except NewsItem.DoesNotExist:
        item = NewsItem(title=title, schema=self.schema)
    description = convert_entities(entry.description)
    # Descriptions are expected to look like "<location> -- <body>".
    try:
        location, description = description.split(' -- ', 1)
    except ValueError:
        logger.error("Unable to parse description: %s", description)
        return
    item.url = entry.link
    item.description = description
    item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
    try:
        # NOTE(review): the geocode() result is assigned directly to
        # location_name — confirm it returns a display name here rather
        # than a result dict.
        item.location_name = self.geocoder.geocode(location)
    except geocoder.DoesNotExist:
        logger.error("Failed to geocode %s" % location)
        item.location_name = location
    # BUG FIX: an unsaved NewsItem has pk None before save(), so
    # "created" must be `pk is None`; the original test was inverted and
    # reported updates as creations (and vice versa).
    created = item.pk is None
    item.save()
    return created
def update():
    """ Download Calendar RSS feed and update database """
    logger.info("Starting add_events")
    # Hard-coded query: today's events within 20 miles.
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'
    try:
        # Rebind from slug string to the Schema model instance.
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)
    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title).strip()
        # Putting 'event' in the title is redundant, ticket #227
        if title.lower().startswith('event: '):
            title = title[7:]
        try:
            item = NewsItem.objects.get(title=title, schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles "
                        "are not unique?" % title)
            continue
        try:
            # feedparser may or may not prefix namespaced elements.
            item.location_name = (entry.get('xcal_x-calconnect-street')
                                  or entry.get('x-calconnect-street')
                                  or u'')
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # Point takes (x, y) == (longitude, latitude).
            item.location = Point((float(entry['geo_long']),
                                   float(entry['geo_lat'])))
            # (0, 0) is the feed's "no location" sentinel.
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r"
                                % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        # BUG FIX: was a bare `except:`; narrowed to Exception.
        except Exception:
            # BUG FIX: the exception value was passed as a stray
            # positional arg with no %s placeholder, which makes the
            # logging call itself error out; logger.exception already
            # appends the traceback.
            logger.exception("unexpected error: %s", sys.exc_info()[1])
    logger.info("add_events finished: %d added, %d updated"
                % (addcount, updatecount))
def unique_fields(self, list_record): # not necessarily primary key, but for this script's purposes # these are the fields that in combination uniquely idenfity # an article. date = datetime.date(*list_record['updated_parsed'][:3]) precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13', 'E18', 'E5'] precinct = None tags = [t['term'] for t in list_record['tags']] if not tags: return for precinct in tags: if precinct in precincts: # TODO: we need a LocationType for precincts, and shapes; and # then we can set newsitem.location_object to the Location # for this precinct. break if not precinct: self.logger.debug("no precinct found in tags %r" % tags) if 'Boston 24' in tags: # TODO: the 'Boston 24' tag indicates posts with aggregate # daily stats. Make a separate schema for aggregates, # with attributes like those used in # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py. # These are citywide though, not by precinct. # So what would be the Location? Whole city?? self.logger.info("boston daily crime stats, we don't know how to " "handle these yet") description = list_record['content'][0]['value'] # TODO: we should have a stock 'clean up html' function. 
description = preprocess_to_string( description, drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'p', 'strong', 'map', 'small', 'span', 'sub', 'sup', 'topic', 'u'), drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea'), drop_attrs=('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target')) from ebdata.retrieval.utils import convert_entities description = convert_entities(description) #description = description.replace(' ', ' ').replace(' ', ' ') addrs = parse_addresses(description) if not addrs: self.logger.info("no addresses found in %r" % list_record['title']) location = None location_name = u'' for addr, unused in addrs: addr = addr.strip() try: from geocoder_hack import quick_dirty_fallback_geocode x, y = quick_dirty_fallback_geocode(addr) if (x, y) != (None, None): location = Point((float(x), float(y))) location_name = addr.title() except: print "ugh, %r" % addr # XXX log something return dict(item_date=date, location=location, location_name=location_name, title=list_record['title'], description=description, )
def update(url):
    """Scrape the police-reports RSS feed at ``url`` and create/update
    NewsItems, logging counts of added and updated items."""
    logger.info("Scraping police reports")
    schema_slug = 'police'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    incident_type_field = SchemaField.objects.get(schema=schema,
                                                  name='incident_type')
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title).strip()
        # The title will be used as the incident type.
        if title in SKIP_TYPES:
            logger.info("Skipping entry of type %s" % title)
            # BUG FIX: the original logged "Skipping" but fell through
            # and saved the entry anyway; skip it for real.
            continue
        description = convert_entities(entry.summary)
        try:
            item = NewsItem.objects.get(schema__id=schema.id, title=title,
                                        description=description)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description "
                        "%r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.pub_date = datetime(*entry.updated_parsed[:6])
            # Point takes (x, y) == (longitude, latitude).
            item.location = Point((float(entry.geo_long),
                                   float(entry.geo_lat)))
            item.location_name = get_element(entry, 'address')
            # parse call time (12-hour clock with AM/PM)
            ct = datetime.strptime(get_element(entry, 'calldatetime'),
                                   r"%m/%d/%Y %I:%M:%S %p")
            item.item_date = ct
            item.save()
            # extra attributes -- each one is best-effort.
            try:
                item.attributes['calldatetime'] = ct
            except Exception:
                pass
            try:
                item.attributes['innum'] = int(get_element(entry, 'innum'))
            except Exception:
                pass
            for k in ['disp', 'aptlot', 'address']:
                try:
                    item.attributes[k] = get_element(entry, k)
                except Exception:
                    pass
            # create a lookup based on the title, this is the closest thing
            # to a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(incident_type_field,
                                                     title, title, "", False)
            item.attributes['incident_type'] = lu.id
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        # BUG FIX: was a bare `except:`; narrowed to Exception.
        except Exception:
            logger.error("Warning: couldn't save %r. Traceback: %s"
                         % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated"
                % (addcount, updatecount))
def update(self):
    """Fetch this scraper's news feed (with HTTP caching) and create or
    update a NewsItem per entry, geocoding / reverse-geocoding as needed.

    Entries whose coordinates cannot be determined, or that fall outside
    the metro bounding box even after flipping lat/lon, are skipped.
    Returns 1 when the schema is missing; otherwise returns None.
    """
    logger.info("Starting LocalNewsScraper update %s" % self.url)
    try:
        schema = Schema.objects.get(slug=self.schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
        return 1
    response, content = self.http.request(self.url)
    if response.fromcache:
        # httplib2 cache hit: nothing new upstream, nothing to do.
        logger.info("Feed is unchanged since last update (cached)")
        return
    f = feedparser.parse(content)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        # Some feeds put the permalink in 'id' rather than 'link'.
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        # Identity is (schema, title, description).
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description "
                        "%r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            # Support both georss and xcal for getting the location name.
            # TODO: should also support ev:location per
            # http://web.resource.org/rss/1.0/modules/event/
            item.location_name = (entry.get('xCal_x-calconnect-street')
                                  or entry.get('x-calconnect-street')
                                  or entry.get('georss_featurename')
                                  or entry.get('featurename'))
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # Truncated title used only in log messages.
            _short_title = item.title[:30] + '...'
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            else:
                if item.location_name:
                    text = item.location_name
                else:
                    # Geocode whatever we can find.
                    text = item.title + ' ' + item.description
                logger.debug("...Falling back on geocoding from %r..."
                             % text[:50])
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        if not item.location_name:
                            item.location_name = result['address']
                        item.block = result['block']
                        break
                    except GeocodingException:
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        # NOTE(review): bare except — also traps
                        # SystemExit/KeyboardInterrupt; consider
                        # narrowing to Exception.
                        logger.error('uncaught geocoder exception on %r\n'
                                     % addr)
                        log_exception()
            if None in (x, y):
                logger.debug("Skip, couldn't geocode any addresses in "
                             "item '%s...'" % _short_title)
                continue
            item.location = Point((float(x), float(y)))
            if not intersects_metro_bbox(item.location):
                # Some bad feeds swap lat/lon; try the flipped point
                # before giving up on the item.
                reversed_loc = Point((float(y), float(x)))
                if intersects_metro_bbox(reversed_loc):
                    logger.info("Got points in apparently reverse order, "
                                "flipping them")
                    item.location = reversed_loc
                else:
                    logger.info("Skipping %r as %s,%s is out of bounds"
                                % (_short_title, y, x))
                    continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r"
                                 % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.info(" Skip, failed to reverse geocode %s for %r"
                                % (item.location.wkt, _short_title))
                    continue
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, _short_title))
        except:
            # NOTE(review): bare except — logs and moves on to the next
            # entry; consider narrowing to Exception.
            logger.error("Warning: couldn't save %r. Traceback:"
                         % _short_title)
            log_exception()
    logger.info("Finished LocalNewsScraper update: %d added, %d updated"
                % (addcount, updatecount))
def update(self):
    """ Download Calendar RSS feed and update database """
    logger.info("Starting EventsCalendarScraper")
    feed = feedparser.parse(self.url)
    seencount = addcount = updatecount = 0
    for entry in feed.entries:
        def ns_get(element):
            # work around feedparser unpredictability: depending on the
            # parser used, namespaced keys may or may not keep a
            # '<namespace>_' prefix.
            namespace, element = element.split(':')
            result = entry.get('%s_%s' % (namespace, element))
            if result is None:
                result = entry.get(element)
            return result
        seencount += 1
        title = convert_entities(entry.title)
        try:
            item = NewsItem.objects.get(title=title,
                                        schema__id=self.schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles "
                        "are not unique?" % title)
            continue
        try:
            # NOTE(review): if both venue name and street are missing,
            # '%s %s' yields the truthy string "None None", which would
            # defeat the reverse-geocoding fallback below; confirm the
            # feed always supplies at least one of them.
            item.location_name = '%s %s' % (
                ns_get('xcal:x-calconnect-venue-name'),
                ns_get('xcal:x-calconnect-street'))
            item.location_name = item.location_name.strip()
            item.schema = self.schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            start_dt = ns_get('xcal:dtstart')
            start_dt = dateutil.parser.parse(start_dt)
            # Upstream bug: They provide a UTC offset of +0000 which
            # means times in UTC, but they're actually times in
            # US/Eastern, so do *not* fix the zone.
            #start_dt = start_dt.astimezone(local_tz)
            item.item_date = start_dt.date()
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # Point takes (x, y) == (longitude, latitude).
            item.location = Point((float(ns_get('geo:long')),
                                   float(ns_get('geo:lat'))))
            # (0, 0) is the feed's "no location" sentinel.
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r"
                                % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            # Attributes require a saved item; store start/end times.
            item.attributes['start_time'] = start_dt.time()
            end_dt = ns_get('xcal:dtend') or u''
            if end_dt.strip():
                end_dt = dateutil.parser.parse(end_dt.strip())
                #end_dt = end_dt.astimezone(local_tz)
                item.attributes['end_time'] = end_dt.time()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        # BUG FIX: was a bare `except:`; narrowed to Exception.
        except Exception:
            # BUG FIX: the exception value was passed as a stray
            # positional arg with no %s placeholder, which makes the
            # logging call itself fail and lose the message.
            logger.error("unexpected error: %s", sys.exc_info()[1])
            log_exception()
    logger.info("EventsCalendarScraper finished: %d added, %d updated "
                "of %s total" % (addcount, updatecount, seencount))
def main(argv=None):
    """Fetch a local-news search feed (default: boston.com API) and add
    each entry to the database as a NewsItem, geocoding when the feed
    supplies no point."""
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema = 'local-news'
    try:
        # Rebind from slug string to the Schema model instance.
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)
    f = feedparser.parse(url)
    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title,
                                        description=e.description)
            print "Already have %r (id %d)" % (item.title, item.id)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        # NOTE(review): existing items fall through and are re-saved (and
        # reported as "Added") — confirm whether updates are intended.
        try:
            item.schema = schema
            item.title = convert_entities(e.title)
            item.description = convert_entities(e.description)
            item.url = e.link
            item.location_name = e.get('x-calconnect-street') or e.get('georss_featurename')
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
            # GeoRSS point is "lat lon", so here x holds latitude and y
            # holds longitude; the Point((y, x)) below restores (lon, lat).
            if 'point' in e:
                x, y = e.point.split(' ')
            elif 'georss_point' in e:
                x, y = e.georss_point.split(' ')
            else:
                text = item.title + ' ' + item.description
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(text, parse=True)
                # NOTE(review): confirm quick_dirty_fallback_geocode also
                # returns (lat, lon) in that order; if it returns
                # (lon, lat), the Point below swaps them.
                if None in (x, y):
                    print " couldn't geocode '%s...'" % item.title[:30]
                    continue
            item.location = Point((float(y), float(x)))
            # (0, 0) is a common "no location" sentinel in these feeds.
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                print "Skipping %r as it has bad location 0,0" % item.title
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    print " Reverse-geocoded point to %r" % block.pretty_name
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    print " Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)
                    item.location_name = u''
            item.save()
            print "Added: %s" % item.title
        except:
            # NOTE(review): bare except; also, `f` below shadows the
            # feedparser result — harmless only because the feed object
            # is not used again after this point.
            print "Warning: couldn't save %r. Traceback:" % item.title
            import cStringIO, traceback
            f = cStringIO.StringIO()
            traceback.print_exc(file=f)
            msg = f.getvalue()
            print msg
def update(url):
    """
    Scrape the sheriff RSS feed at ``url`` into NewsItems, deduping on the
    'innum' (incident number) attribute. Exits with status 1 if the schema
    or its required SchemaFields are missing.
    """
    schema_slug = 'sheriff'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    incident_type_field = SchemaField.objects.get(schema=schema,
                                                  name='incident_type')
    try:
        innum_field = SchemaField.objects.get(schema=schema, name='innum')
    except SchemaField.DoesNotExist:
        logger.error("SchemaField innum Does Not Exist for %s" % schema_slug)
        sys.exit(1)
    logger.info("Scraping %s" % schema.name)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        innum = int(get_element(entry, 'innum'))
        title = convert_entities(entry.title)
        description = convert_entities(entry.summary)
        try:
            # Dedupe on the incident number attribute.
            item = NewsItem.objects.filter(schema=schema).by_attribute(
                innum_field, innum)[0]
            status = 'updated'
        except IndexError:
            item = NewsItem()
            status = 'added'
        try:
            item.title = title
            item.schema = schema
            item.description = description
            try:
                item.location = Point((float(entry.geo_long),
                                       float(entry.geo_lat)))
            except (AttributeError, TypeError, ValueError):
                # Bug fix: the old message said "Skipping" but the item was
                # (and still is) saved -- just without a point.
                logger.info("Item %s has no usable location information" % innum)
            item.location_name = get_element(entry, 'address')
            # This feed uses an invalidly formatted pubDate which appears to
            # be intended to express the time of the incident, used for
            # publication time as well. 24-hour time.
            ct = datetime.strptime(entry.updated, r"%m/%d/%Y %H:%M:%S")
            item.item_date = ct
            item.pub_date = ct.date()
            item.save()
            # Extra attributes.
            item.attributes['innum'] = innum
            for k in ['address']:
                try:
                    item.attributes[k] = get_element(entry, k)
                except Exception:
                    # Best-effort: the element may simply be absent.
                    pass
            # Create a lookup based on the title; this is the closest thing
            # to a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(
                incident_type_field, title, title, "", False)
            item.attributes['incident_type'] = lu.id
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (
                item.title, traceback.format_exc()))
    logger.info("Finished scraping %s: %d added, %d updated" % (
        schema.name, addcount, updatecount))
def update():
    """
    Download Calendar RSS feed and update database
    """
    logger.info("Starting add_events")
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)
    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title).strip()
        # Putting 'event' in the title is redundant, ticket #227
        if title.lower().startswith('event: '):
            title = title[7:]
        try:
            item = NewsItem.objects.get(title=title, schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn(
                "Multiple entries matched title %r, event titles are not unique?"
                % title)
            continue
        try:
            item.location_name = entry.get(
                'xcal_x-calconnect-street') or entry.get(
                'x-calconnect-street') or u''
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point(
                (float(entry['geo_long']), float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r"
                                % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception:
            # Bug fix: the old call passed sys.exc_info()[1] as a %-style
            # argument with no placeholder in the format string, so the
            # logging module itself raised a formatting error.
            # logger.exception() already appends the traceback.
            logger.exception("unexpected error")
    logger.info("add_events finished: %d added, %d updated" % (
        addcount, updatecount))
def article_text_sections(tree):
    """
    Given an HTML tree of a news article (or blog entry permalink), deduces
    which part of it is text and returns a list of lists of strings, with
    each string representing a paragraph and each list of strings
    representing a "section" of the page.
    """
    # Strategy: flatten the tree so every block-level element becomes a
    # newline-delimited run of text, group the runs into sections, then
    # keep only sections that look like prose.
    MIN_NUM_PARAGRAPHS = 3
    MIN_NUM_PUNCTUATED = 3
    # A paragraph must have at least this many characters to count toward
    # MIN_NUM_PUNCTUATED.
    MIN_CHARS_IN_PARAGRAPH = 30
    # With this many qualifying paragraphs a section is kept regardless of
    # failing MIN_PERCENTAGE_PUNCTUATED.
    NUM_PARAGRAPHS_SAFE_GUESS = 6
    # Otherwise at least this fraction of a section's paragraphs must be
    # punctuated for the section to be included.
    MIN_PERCENTAGE_PUNCTUATED = decimal.Decimal(".5")

    block_tags = {"blockquote", "dd", "div", "dt", "h1", "h2", "h3", "h4",
                  "h5", "h6", "h7", "h8", "li", "p", "td", "th", "tr"}
    drop_tags_only = {"a", "abbr", "acronym", "b", "center", "dir", "dl",
                      "em", "font", "form", "hr", "i", "label", "menu",
                      "ol", "pre", "small", "span", "strong", "sub", "sup",
                      "table", "tbody", "tfoot", "thead", "topic", "u",
                      "ul", "wbr"}
    drop_tags_and_contents = {"applet", "area", "button", "embed", "img",
                              "iframe", "head", "input", "link", "map",
                              "meta", "noscript", "object", "option",
                              "script", "select", "spacer", "style",
                              "textarea", "title"}
    layout_tags = {"div", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8",
                   "td", "th", "tr"}
    is_open_tag = re.compile(r"^<[^/][^>]+>$").search
    is_close_tag = re.compile(r"^</[^>]+>$").search
    ignored_paragraphs = {"del.icio.us", "digg", "email", "e-mail editor",
                          "e-mail story", "no comments", "print",
                          "print article", "printer-friendly",
                          "printer version", "reprints"}

    doomed = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring):
            # Comments/processing instructions have non-string tags.
            element.drop_tag()
            continue
        if element.text and "\n" in element.text:
            element.text = element.text.replace("\n", " ")
        if element.tail and "\n" in element.tail:
            element.tail = element.tail.replace("\n", " ")
        if element.tag in block_tags:
            element.text = "\n" + (element.text or "")
            element.tail = "\n" + (element.tail or "")
        elif element.tag == "br":
            element.tail = "\n" + (element.tail or "")
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            doomed.append(element)
        elif element.tag not in ("html", "body"):
            # Unknown tag!
            element.drop_tag()
    for victim in doomed:
        victim.drop_tree()

    for element in tree.getiterator():
        if element.tag in block_tags:
            if element.tag in layout_tags:
                element.text = "\n<%s>\n%s\n" % (element.tag, (element.text or ""))
                element.tail = "\n</%s>\n%s\n" % (element.tag, (element.tail or ""))
            element.drop_tag()

    try:
        tree.body
    except IndexError:
        # Some articles are missing a <body> tag; tree.body then raises
        # IndexError. Just skip these.
        return []

    flattened = etree.tostring(tree.body, method="html")
    flattened = convert_entities(flattened)
    lines = re.split(r"\s*\n+\s*", flattened.strip())

    open_groups = []
    sections = []
    for line in lines:
        if is_open_tag(line):
            open_groups.append([])
        elif is_close_tag(line):
            closed = open_groups.pop()
            if len(closed) >= MIN_NUM_PARAGRAPHS:
                sections.append(closed)
        else:
            # Plain text, not a tag.
            try:
                open_groups[-1].append(line)
            except IndexError:
                # No tags seen yet.
                open_groups.append([line])

    # Cut out the sections that don't contain enough punctuated sentences.
    final_sections = []
    for section in sections:
        count = 0
        to_delete = []
        for idx, paragraph in enumerate(section):
            if paragraph.lower() in ignored_paragraphs:
                to_delete.append(idx)
            elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
                count += 1
        percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (
                count >= MIN_NUM_PUNCTUATED
                and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
            # Delete in reverse so earlier indexes stay valid.
            for idx in reversed(to_delete):
                del section[idx]
            final_sections.append(section)
    return final_sections
def article_text_sections(tree):
    """
    Given an HTML tree of a news article (or blog entry permalink), deduces
    which part of it is text and returns a list of lists of strings, with
    each string representing a paragraph and each list of strings
    representing a "section" of the page.
    """
    # All text inside the same block element (e.g. a <div>) is combined;
    # block boundaries become section boundaries.
    MIN_NUM_PARAGRAPHS = 3
    MIN_NUM_PUNCTUATED = 3
    # Minimum paragraph length to count toward MIN_NUM_PUNCTUATED.
    MIN_CHARS_IN_PARAGRAPH = 30
    # Enough long paragraphs and the section is kept unconditionally.
    NUM_PARAGRAPHS_SAFE_GUESS = 6
    # Otherwise this fraction of the section must be punctuated.
    MIN_PERCENTAGE_PUNCTUATED = decimal.Decimal('.5')

    block_tags = frozenset(['blockquote', 'dd', 'div', 'dt', 'h1', 'h2',
                            'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'li', 'p',
                            'td', 'th', 'tr'])
    drop_tags_only = frozenset(['a', 'abbr', 'acronym', 'b', 'center',
                                'dir', 'dl', 'em', 'font', 'form', 'hr',
                                'i', 'label', 'menu', 'ol', 'pre', 'small',
                                'span', 'strong', 'sub', 'sup', 'table',
                                'tbody', 'tfoot', 'thead', 'topic', 'u',
                                'ul', 'wbr'])
    drop_tags_and_contents = frozenset(['applet', 'area', 'button', 'embed',
                                        'img', 'iframe', 'head', 'input',
                                        'link', 'map', 'meta', 'noscript',
                                        'object', 'option', 'script',
                                        'select', 'spacer', 'style',
                                        'textarea', 'title'])
    layout_tags = frozenset(['div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                             'h7', 'h8', 'td', 'th', 'tr'])
    is_open_tag = re.compile(r'^<[^/][^>]+>$').search
    is_close_tag = re.compile(r'^</[^>]+>$').search
    ignored_paragraphs = frozenset(['del.icio.us', 'digg', 'email',
                                    'e-mail editor', 'e-mail story',
                                    'no comments', 'print', 'print article',
                                    'printer-friendly', 'printer version',
                                    'reprints'])

    pending_removal = []
    for node in tree.getiterator():
        if not isinstance(node.tag, basestring):
            # A non-string tag means a comment or similar artifact.
            node.drop_tag()
            continue
        if node.text and '\n' in node.text:
            node.text = node.text.replace('\n', ' ')
        if node.tail and '\n' in node.tail:
            node.tail = node.tail.replace('\n', ' ')
        if node.tag in block_tags:
            node.text = '\n' + (node.text or '')
            node.tail = '\n' + (node.tail or '')
        elif node.tag == 'br':
            node.tail = '\n' + (node.tail or '')
            node.drop_tag()
        elif node.tag in drop_tags_only:
            node.drop_tag()
        elif node.tag in drop_tags_and_contents:
            pending_removal.append(node)
        elif node.tag not in ('html', 'body'):
            # Unknown tag!
            node.drop_tag()
    for node in pending_removal:
        node.drop_tree()

    for node in tree.getiterator():
        if node.tag in block_tags:
            if node.tag in layout_tags:
                node.text = '\n<%s>\n%s\n' % (node.tag, (node.text or ''))
                node.tail = '\n</%s>\n%s\n' % (node.tag, (node.tail or ''))
            node.drop_tag()

    try:
        tree.body
    except IndexError:
        # The article is missing a <body> tag and tree.body raises
        # IndexError; skip such documents.
        return []

    html_text = convert_entities(etree.tostring(tree.body, method='html'))
    pieces = re.split(r'\s*\n+\s*', html_text.strip())

    groups = []
    sections = []
    for piece in pieces:
        if is_open_tag(piece):
            groups.append([])
        elif is_close_tag(piece):
            finished = groups.pop()
            if len(finished) >= MIN_NUM_PARAGRAPHS:
                sections.append(finished)
        else:
            # Text content rather than a tag.
            try:
                groups[-1].append(piece)
            except IndexError:
                # No tags seen yet.
                groups.append([piece])

    # Discard sections lacking enough punctuated sentences.
    final_sections = []
    for section in sections:
        punctuated = 0
        drop_indexes = []
        for pos, para in enumerate(section):
            if para.lower() in ignored_paragraphs:
                drop_indexes.append(pos)
            elif is_punctuated(para) and len(para) >= MIN_CHARS_IN_PARAGRAPH:
                punctuated += 1
        fraction = decimal.Decimal(punctuated) / decimal.Decimal(len(section))
        if punctuated >= NUM_PARAGRAPHS_SAFE_GUESS or (
                punctuated >= MIN_NUM_PUNCTUATED
                and fraction >= MIN_PERCENTAGE_PUNCTUATED):
            # Delete in reverse so that index order is preserved.
            for pos in reversed(drop_indexes):
                del section[pos]
            final_sections.append(section)
    return final_sections
def update(self): logger.info("Starting LocalNewsScraper update %s" % self.url) try: schema = Schema.objects.get(slug=self.schema_slug) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % self.schema_slug) return 1 response, content = self.http.request(self.url) if response.fromcache: logger.info("Feed is unchanged since last update (cached)") return f = feedparser.parse(content) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn( "Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url # Support both georss and xcal for getting the location name. # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/ item.location_name = entry.get( 'xCal_x-calconnect-street') or entry.get( 'x-calconnect-street') or entry.get( 'georss_featurename') or entry.get('featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) _short_title = item.title[:30] + '...' # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: # GeoRSS puts latitude (Y) first. y, x = point.split(' ') else: if item.location_name: text = item.location_name else: # Geocode whatever we can find. 
text = item.title + ' ' + item.description logger.debug("...Falling back on geocoding from %r..." % text[:50]) addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y if not item.location_name: item.location_name = result['address'] break except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error( 'uncaught geocoder exception on %r\n' % addr) log_exception() if None in (x, y): logger.debug( "Skip, couldn't geocode any addresses in item '%s...'" % _short_title) continue item.location = Point((float(x), float(y))) if not intersects_metro_bbox(item.location): reversed_loc = Point((float(y), float(x))) if intersects_metro_bbox(reversed_loc): logger.info( "Got points in apparently reverse order, flipping them" ) item.location = reversed_loc else: logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x)) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode( item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.info( " Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title)) continue item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, _short_title)) except: logger.error("Warning: couldn't save %r. Traceback:" % _short_title) log_exception() logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
def update(self): # # # Download Calendar RSS feed and update database # # logger.info("Starting KSUStudentProgrammingScraper") feed = feedparser.parse(self.url) seencount = addcount = updatecount = 0 for entry in feed.entries: seencount += 1 title = convert_entities(entry.title) title = foo(title, '', ' (') try: item = NewsItem.objects.get(title=title, schema__id=self.schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn("Multiple entries matched title %r, event titles are not unique?" % title) continue try: # # # The actual rss feed elements are grabbed here # # itm_description = entry.description soup = BeautifulSoup(foo(itm_description,"</table><br />","<br /><br />")) locations = soup.findAll(text=True) location = locations[0].strip() place_grabber = places.place_grabber() grab_results = place_grabber(location) try: item.location = Place.objects.get(pretty_name=grab_results[0][2]).location item.location_name = Place.objects.get(pretty_name=grab_results[0][2]).pretty_name except: item.location = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place.location item.location_name = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place.pretty_name try: item.attributes['room'] = locations[1].strip().replace("Room: ","") except Exception as e: logger.info("Tried saving item.room, error: %s" % e) item.schema = self.schema item.title = title soup = BeautifulSoup(foo(itm_description,"<br /><br />","</td></tr>")) item.description = soup.findAll(text=True) item.description = item.description[0].strip() item.url = entry.link start_t = foo(itm_description,"Start Time:</b> </td><td>","</td>") start_t = dateutil.parser.parse(start_t) end_t = foo(itm_description,"End Time:</b> </td><td>","</td>") end_t = dateutil.parser.parse(end_t) end_dt = foo(itm_description,"End Date:</b> </td><td>","</td>") end_dt = dateutil.parser.parse(end_dt) item.item_date = 
dateutil.parser.parse(entry.category) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.attributes['start-time'] = start_t.time() item.attributes['end-time'] = end_t.time() item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except Exception as e: logger.exception("unexpected error: %s" % e) logger.info("KSUStudentProgrammingScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def update(self): """ Download Calendar RSS feed and update database """ logger.info("Starting EventsCalendarScraper") feed = feedparser.parse(self.url) seencount = addcount = updatecount = 0 for entry in feed.entries: def ns_get(element): # work around feedparser unpredictability. namespace, element = element.split(':') result = entry.get('%s_%s' % (namespace, element)) if result is None: result = entry.get(element) return result seencount += 1 title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=self.schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn( "Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = '%s %s' % ( ns_get('xcal:x-calconnect-venue-name'), ns_get('xcal:x-calconnect-street')) item.location_name = item.location_name.strip() item.schema = self.schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link start_dt = ns_get('xcal:dtstart') start_dt = dateutil.parser.parse(start_dt) # Upstream bug: They provide a UTC offset of +0000 which # means times in UTC, but they're actually times in # US/Eastern, so do *not* fix the zone. #start_dt = start_dt.astimezone(local_tz) item.item_date = start_dt.date() item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point( (float(ns_get('geo:long')), float(ns_get('geo:lat')))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. 
from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode( item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() item.attributes['start_time'] = start_dt.time() end_dt = ns_get('xcal:dtend') or u'' if end_dt.strip(): end_dt = dateutil.parser.parse(end_dt.strip()) #end_dt = end_dt.astimezone(local_tz) item.attributes['end_time'] = end_dt.time() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except Exception as e: logger.exception("unexpected error: %s" % e) logger.info( "EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def main(argv=None):
    """
    Download the Boston local-news feed (or the feed whose URL is given as
    ``argv[0]``) and add/update NewsItems for the 'local-news' schema.
    Exits the process with status 1 if the schema is missing.
    """
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        # Some feeds have the item URL in 'id' instead of 'link'.
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            if point:
                # Robustness fix: split() instead of split(' ') tolerates
                # runs of whitespace between the coordinates.
                x, y = point.split()
            else:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                try:
                    x, y = quick_dirty_fallback_geocode(text, parse=True)
                except GeocodingException:
                    logger.debug("Geocoding exception on %r:" % text)
                    log_exception(level=logging.DEBUG)
                    continue
            if None in (x, y):
                logger.info("couldn't geocode '%s...'" % item.title[:30])
                continue
            # GeoRSS order is latitude-first, so (x, y) here is (lat, lon);
            # Point takes (lon, lat).
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0"
                            % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r"
                                 % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated"
                % (addcount, updatecount))
def update(argv=None):
    """
    Download the Boston local-news feed (or the feed whose URL is given as
    ``argv[0]``) and add/update NewsItems for the 'local-news' schema.
    Exits the process with status 1 if the schema is missing.
    """
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)
    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        # Some feeds have the item URL in 'id' instead of 'link'.
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS order is latitude-first: x is lat, y is lon here.
                x, y = point.split(' ')
            if None in (x, y):
                # Bug fix: this branch used to be ``if True:``, which
                # re-geocoded every entry and could overwrite coordinates
                # already supplied by the feed. Only fall back on geocoding
                # when the feed gave us nothing.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        # Bug fix: the geocoder point has x=lon, y=lat, but
                        # the (x, y) pair here is (lat, lon) to match the
                        # GeoRSS branch above -- so assign swapped.
                        x, y = point.y, point.x
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text,
                                     exc_info=True)
                        continue
                    except Exception:
                        logger.exception(
                            'uncaught geocoder exception on %r\n' % addr)
            if None in (x, y):
                logger.info("couldn't geocode '%s...'" % item.title[:30])
                continue
            # (x, y) is (lat, lon); Point takes (lon, lat).
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0"
                            % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r"
                                 % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r"
                                 % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception:
            logger.exception("Warning: couldn't save %r. Traceback:"
                             % item.title)
    logger.info("Finished add_news: %d added, %d updated"
                % (addcount, updatecount))
def article_text_sections(tree):
    """
    Given an HTML tree of a news article (or blog entry permalink), deduces
    which part of it is text and returns a list of lists of strings, with
    each string representing a paragraph and each list of strings
    representing a "section" of the page.
    """
    # Text within a single block element (e.g. a <div>) is merged together;
    # block boundaries delimit candidate sections.
    MIN_NUM_PARAGRAPHS = 3
    MIN_NUM_PUNCTUATED = 3
    # Paragraphs shorter than this don't count toward MIN_NUM_PUNCTUATED.
    MIN_CHARS_IN_PARAGRAPH = 30
    # Sections with this many qualifying paragraphs are kept even if they
    # fail MIN_PERCENTAGE_PUNCTUATED.
    NUM_PARAGRAPHS_SAFE_GUESS = 6
    # Otherwise this fraction of a section's paragraphs must be punctuated.
    MIN_PERCENTAGE_PUNCTUATED = decimal.Decimal('.5')

    block_tags = {'blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3', 'h4',
                  'h5', 'h6', 'h7', 'h8', 'li', 'p', 'td', 'th', 'tr'}
    drop_tags_only = {'a', 'abbr', 'acronym', 'b', 'center', 'dir', 'dl',
                      'em', 'font', 'form', 'hr', 'i', 'label', 'menu',
                      'ol', 'pre', 'small', 'span', 'strong', 'sub', 'sup',
                      'table', 'tbody', 'tfoot', 'thead', 'topic', 'u',
                      'ul', 'wbr'}
    drop_tags_and_contents = {'applet', 'area', 'button', 'embed', 'img',
                              'iframe', 'head', 'input', 'link', 'map',
                              'meta', 'noscript', 'object', 'option',
                              'script', 'select', 'spacer', 'style',
                              'textarea', 'title'}
    layout_tags = {'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8',
                   'td', 'th', 'tr'}
    is_open_tag = re.compile(r'^<[^/][^>]+>$').search
    is_close_tag = re.compile(r'^</[^>]+>$').search
    ignored_paragraphs = {'del.icio.us', 'digg', 'email', 'e-mail editor',
                          'e-mail story', 'no comments', 'print',
                          'print article', 'printer-friendly',
                          'printer version', 'reprints'}

    condemned = []
    for elem in tree.getiterator():
        if not isinstance(elem.tag, basestring):
            # Non-string tag => comment node or similar.
            elem.drop_tag()
            continue
        if elem.text and '\n' in elem.text:
            elem.text = elem.text.replace('\n', ' ')
        if elem.tail and '\n' in elem.tail:
            elem.tail = elem.tail.replace('\n', ' ')
        if elem.tag in block_tags:
            elem.text = '\n' + (elem.text or '')
            elem.tail = '\n' + (elem.tail or '')
        elif elem.tag == 'br':
            elem.tail = '\n' + (elem.tail or '')
            elem.drop_tag()
        elif elem.tag in drop_tags_only:
            elem.drop_tag()
        elif elem.tag in drop_tags_and_contents:
            condemned.append(elem)
        elif elem.tag not in ('html', 'body'):
            # Unknown tag!
            elem.drop_tag()
    for elem in condemned:
        elem.drop_tree()

    for elem in tree.getiterator():
        if elem.tag in block_tags:
            if elem.tag in layout_tags:
                elem.text = '\n<%s>\n%s\n' % (elem.tag, (elem.text or ''))
                elem.tail = '\n</%s>\n%s\n' % (elem.tag, (elem.tail or ''))
            elem.drop_tag()

    try:
        tree.body
    except IndexError:
        # Articles missing a <body> tag make tree.body raise IndexError;
        # just skip those.
        return []

    rendered = convert_entities(etree.tostring(tree.body, method='html'))
    chunks = re.split(r'\s*\n+\s*', rendered.strip())

    stack = []
    sections = []
    for chunk in chunks:
        if is_open_tag(chunk):
            stack.append([])
        elif is_close_tag(chunk):
            completed = stack.pop()
            if len(completed) >= MIN_NUM_PARAGRAPHS:
                sections.append(completed)
        else:
            # Text content rather than a tag.
            try:
                stack[-1].append(chunk)
            except IndexError:
                # No tags seen yet.
                stack.append([chunk])

    # Drop sections that don't contain enough punctuated sentences.
    final_sections = []
    for section in sections:
        punct_count = 0
        removal = []
        for pos, para in enumerate(section):
            # Quotes seem to be causing the is_punctuated to fail
            para = para.replace('"', '').strip()
            para = para.replace("'", '').strip()
            if para.lower() in ignored_paragraphs:
                removal.append(pos)
            elif is_punctuated(para) and len(para) >= MIN_CHARS_IN_PARAGRAPH:
                punct_count += 1
        share = decimal.Decimal(punct_count) / decimal.Decimal(len(section))
        if punct_count >= NUM_PARAGRAPHS_SAFE_GUESS or (
                punct_count >= MIN_NUM_PUNCTUATED
                and share >= MIN_PERCENTAGE_PUNCTUATED):
            # Delete in reverse so that index order is preserved.
            for pos in reversed(removal):
                del section[pos]
            final_sections.append(section)
    return final_sections