def clean_list_record(self, record):
    # Clean up a record dict.
    venue = record.get('venue', {})
    if not venue:
        raise SkipRecord("No venue")
    location_name_parts = [venue.get(key, '').strip()
                           for key in ('address_1', 'address_2', 'city', 'state', 'zip')]
    location_name = ', '.join([p for p in location_name_parts if p])
    event_time = datetime.datetime.fromtimestamp(record['time'] / 1000.0, local_tz)
    cleaned = {'title': text_from_html(record['name']),
               'description': text_from_html(record.get('description', '')),
               'location_name': location_name,
               'location': Point(venue['lon'], venue['lat']),
               'url': record['event_url'],
               'item_date': event_time.date(),
               }
    attributes = {'venue_phone': venue.get('phone', ''),
                  'venue_name': text_from_html(venue.get('name', '')),
                  'start_time': event_time.time(),
                  'group_name': record['group']['name'],
                  }
    cleaned['_attributes'] = attributes
    return cleaned
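# A minimal standalone sketch of the two fiddly bits above (all values
# hypothetical; the real method also depends on the scraper's local_tz):
# this style of feed reports event times in epoch *milliseconds*, and the
# venue fields are joined into one comma-separated location_name.
import datetime

def _sketch_venue_cleanup(venue, time_ms):
    parts = [venue.get(key, '').strip()
             for key in ('address_1', 'address_2', 'city', 'state', 'zip')]
    location_name = ', '.join(p for p in parts if p)
    event_time = datetime.datetime.utcfromtimestamp(time_ms / 1000.0)
    return location_name, event_time

# _sketch_venue_cleanup({'address_1': '1 Main St', 'city': 'Boston',
#                        'state': 'MA', 'zip': '02101'}, 1336422600000)
# -> ('1 Main St, Boston, MA, 02101', datetime.datetime(2012, 5, 7, 20, 30))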
def clean_list_record(self, record):
    date = datetime.date(*record['updated_parsed'][:3])
    description = record['summary']
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    # First we'll need some suitable text; throw away HTML tags.
    full_description = record['content'][0]['value']
    full_description = text_from_html(full_description)
    # This method on the RssListDetailScraper does the rest.
    location, location_name = self.get_point_and_location_name(
        record, address_text=full_description)
    if not (location or location_name):
        locs = self.grabber(full_description)
        if locs:
            location_name = locs[0][2]
            location = Location.objects.get(name=location_name).location
    if not (location or location_name):
        raise SkipRecord("No location or location_name")
    cleaned = {'item_date': date,
               'location': location,
               'location_name': location_name,
               'title': record['title'],
               'description': description,
               'url': record['link'],
               }
    return cleaned
def save(self, old_record, list_record, detail_record):
    # This gets called once all parsing and cleanup is done.
    # It looks a lot like our 'expedient hack' code above.
    # We can ignore detail_record since has_detail is False.
    date = datetime.date(*list_record['updated_parsed'][:3])
    description = text_from_html(list_record['summary'])
    # This feed doesn't provide geographic data; we'll try to
    # extract location names from the text, and stop on the first
    # one that matches a known LocationSynonym.
    grabber = places.location_grabber()
    addrs = grabber(description)
    if not addrs:
        addrs = grabber(list_record['title'])
    if not addrs:
        self.logger.info("no addresses found")
        return
    location = None
    location_name = u''
    # Ready to look up locations. If we had one location_name to try,
    # this could be done automatically in create_or_update(), but
    # we have multiple possible location_names.
    for l, r, name in addrs:
        try:
            location_syn = LocationSynonym.objects.get(pretty_name=name)
            location = Location.objects.get(name=location_syn.location).location
        except (LocationSynonym.DoesNotExist, Location.DoesNotExist):
            log_exception(level=logging.DEBUG)
            continue
        location_name = name
        break
    if location is None:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return
    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  description=description,
                  title=list_record['title'],
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
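# A standalone sketch of the synonym resolution above, with the Django
# models replaced by plain dicts (all names and coordinates hypothetical):
_SYNONYMS = {'KSU': 'Kent State'}                # pretty_name -> Location.name
_LOCATIONS = {'Kent State': (-81.357, 41.147)}   # Location.name -> point

def _resolve_synonym(pretty_name):
    canonical = _SYNONYMS.get(pretty_name)
    if canonical is None:
        return None   # no synonym: caller moves on to the next candidate
    return _LOCATIONS.get(canonical)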
def clean_list_record(self, record):
    if record['title'].startswith(u'Boston 24'):
        # We don't include the summary posts.
        # TODO: the 'Boston 24' tag indicates posts with aggregate
        # daily stats. Make a separate schema for the aggregates,
        # with attributes like those used in
        # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
        # Or maybe not: these are citywide, not by precinct.
        # So what would be the Location? Whole city??
        self.logger.info("boston daily crime stats, we don't know how to "
                         "handle these yet")
        raise SkipRecord
    date = datetime.date(*record['updated_parsed'][:3])
    description = record['summary']
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    full_description = record['content'][0]['value']
    full_description = text_from_html(full_description)
    location, location_name = self.get_point_and_location_name(
        record, address_text=full_description)
    if not (location or location_name):
        raise SkipRecord("No location or location_name")
    # Get the precinct from the tags.
    precincts = ['A1', 'A15', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14',
                 'D4', 'E13', 'E18', 'E5']
    tags = [t['term'] for t in record['tags']]
    precinct = None
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct. For now we just save it as an attribute.
            precinct = tag
            break
    attributes = {}
    if precinct:
        precinct = self.get_or_create_lookup('precinct', precinct, precinct)
        attributes['precinct'] = precinct.id
    else:
        raise SkipRecord("no precinct found in tags %r" % tags)
    cleaned = dict(item_date=date,
                   location=location,
                   location_name=location_name,
                   title=record['title'],
                   description=description,
                   url=record['link'],
                   attributes=attributes,
                   )
    return cleaned
def clean_detail_record(self, detail_record):
    detail_record = detail_record.copy()
    detail_record['description'] = text_from_html(detail_record['description'])
    detail_record['url'] = detail_record.pop('link')
    detail_record['item_date'] = datetime.date(*detail_record.pop('updated_parsed')[:3])
    # Drop feedparser bookkeeping keys we don't store.
    for key in ('updated', 'title_detail', 'summary_detail', 'guidislink',
                'summary', 'links', 'id'):
        del detail_record[key]
    return detail_record
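# A quick, runnable sanity check of the date handling above: feedparser's
# *_parsed fields are time.struct_time-like 9-tuples, so slicing the first
# three items yields (year, month, day). The entry below is hypothetical.
import datetime

_entry = {
    'link': 'http://example.com/1',
    'description': '<p>Road closed</p>',
    'updated_parsed': (2012, 5, 7, 20, 30, 0, 0, 128, 0),
}
assert datetime.date(*_entry['updated_parsed'][:3]) == datetime.date(2012, 5, 7)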
def parse_list(self, page):
    """
    Yields a dictionary of data for each record on the page.
    See list_detail.py.
    """
    tree = self.parse_html(page)
    for review in tree.findall("//*[@typeof='v:Review']"):
        url = review.find(".//a[@class='permalink']").attrib['href']
        if url.startswith('/'):
            url = self.base_url + url
        rating = review.find(".//*[@property='v:rating']").text
        rating = rating.split(':')[-1].strip()
        reviewer = review.find(".//*[@property='v:reviewer']").text
        reviewed = review.find(".//*[@property='v:itemreviewed']").text
        description = review.find(".//*[@property='v:description']/p")
        description = getattr(description, 'text', '')
        if description:
            description = text_from_html(description)
        else:
            self.logger.info("Blank description, skipping...")
            continue
        date = review.find(".//*[@property='v:dtreviewed']").text
        date = parse_date(date, '%B %d, %Y')
        if date < self.start_date:
            raise StopScraping("Reached cutoff date %s" % self.start_date)
        data = {'url': url,
                'title': reviewed,
                'description': description,
                'item_date': date,
                '_attributes': {'rating': rating,
                                'reviewer': reviewer,
                                'business_name': reviewed,
                                },
                }
        yield data
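# The findall()/find() calls above key off RDFa-style attributes
# (typeof='v:Review', property='v:rating', ...). A standalone sketch of
# that selection with lxml on a made-up snippet:
import lxml.html

_snippet = """
<div>
  <div typeof="v:Review">
    <a class="permalink" href="/reviews/1">link</a>
    <span property="v:rating">Rating: 4</span>
    <span property="v:reviewer">Alice</span>
  </div>
</div>
"""
_doc = lxml.html.fromstring(_snippet)
for _review in _doc.findall(".//*[@typeof='v:Review']"):
    _rating = _review.find(".//*[@property='v:rating']").text
    print(_rating.split(':')[-1].strip())   # prints: 4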
def save(self, old_record, list_record, detail_record):
    # TODO: move some of this to clean_list_record?
    date = datetime.date(*list_record['updated_parsed'][:3])
    # Get the precinct from the tags.
    precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                 'E13', 'E18', 'E5']
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    precinct = None
    for tag in tags:
        if tag in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct.
            precinct = tag
            break
    if not precinct:
        self.logger.debug("no precinct found in tags %r" % tags)
    description = list_record['summary']
    full_description = list_record['content'][0]['value']
    full_description = text_from_html(full_description)
    addrs = parse_addresses(full_description)
    if not addrs:
        self.logger.info("no addresses found in %r %r" %
                         (list_record['title'], list_record['link']))
        return
    location = None
    location_name = u''
    block = None
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            location = SmartGeocoder().geocode(addr)
        except (GeocodingException, ParsingError):
            log_exception(level=logging.DEBUG)
            continue
        location_name = location['address']
        block = location['block']
        location = location['point']
        break
    else:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return
    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  title=list_record['title'],
                  description=description,
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
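# The for/else above implements "take the first address that geocodes":
# success breaks out of the loop; the else branch, which runs only when
# the loop finishes without a break, logs and bails. A standalone sketch
# of the pattern with an injected geocoder callable (hypothetical;
# ValueError stands in for GeocodingException and ParsingError):
def _first_geocodable(addrs, geocode):
    for addr in addrs:
        try:
            result = geocode(addr.strip())
        except ValueError:
            continue
        location_name = result['address']
        point = result['point']
        break
    else:
        # Loop exhausted without a break: nothing geocoded.
        return None, None
    return location_name, point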
def save(self, old_record, list_record, detail_record):
    # This gets called once all parsing and cleanup is done.
    # It looks a lot like our 'expedient hack' code above.
    # We can ignore detail_record since has_detail is False.
    date = datetime.date(*list_record['updated_parsed'][:3])
    description = text_from_html(list_record['summary'])
    location = None
    location_name = u''
    grabber = places.place_grabber()
    addrs = grabber(description)
    if not addrs:
        # If no match is found, the article is assigned the location
        # of Kent State.
        location_name = "Kent State"
        location_syn = LocationSynonym.objects.get(pretty_name=location_name)
        location = Location.objects.get(name=location_syn.location).location
        self.logger.info("no matches for place found. Using Kent State default")
    else:
        # Here we check the place grabber's results for matches in the
        # database: first Places are checked, then PlaceSynonyms.
        for l, r, name in addrs:
            try:
                place = Place.objects.get(pretty_name=name)
                location = place.location
            except Place.DoesNotExist:
                try:
                    place = PlaceSynonym.objects.get(pretty_name=name)
                    location = place.place.location
                except PlaceSynonym.DoesNotExist:
                    self.logger.debug("place %r not found in database" % name)
                    continue
            location_name = name
            break
    if location is None:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return
    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  description=description,
                  title=list_record['title'],
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)