def save(self, old_record, list_record, detail_record):
    # This gets called once all parsing and cleanup is done.
    # It looks a lot like our 'expedient hack' code above.
    # We can ignore detail_record since has_detail is False.
    date = datetime.date(*list_record['updated_parsed'][:3])
    description = text_from_html(list_record['summary'])

    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    # First we'll need some suitable text; throw away HTML tags.
    # full_description = list_record['content'][0]['value']
    # full_description = text_from_html(full_description)
    grabber = places.location_grabber()
    addrs = grabber(description)  # NOTE: replaced by the place grabber results below
    # printing the article's title for debugging
    # print list_record['title']
    # if not addrs:
    #     addrs = grabber(list_record['title'])
    #     if not addrs:
    #         self.logger.info("no addresses found")
    #         return

    location = None
    location_name = u''
    block = None

    grabber = places.place_grabber()
    addrs = grabber(description)

    # If no match is found, the article is assigned the default Kent State location.
    if not addrs:
        location_name = "Kent State"
        locationSyn = LocationSynonym.objects.get(pretty_name=location_name)
        location = Location.objects.get(name=locationSyn.location).location
        self.logger.info("no matches for place found. Using Kent State default")
    else:
        location = None
        location_name = u''
        block = None

        # Here we check the results returned by the place grabber for matches
        # in the database: Places are checked first, then PlaceSynonyms.
        for l, r, name in addrs:
            # addr = addr.strip()
            try:
                print name
                place = Place.objects.get(pretty_name=name)
                location = place.location
            except Place.DoesNotExist:
                try:
                    place = PlaceSynonym.objects.get(pretty_name=name)
                    location = place.place.location
                except PlaceSynonym.DoesNotExist:
                    self.logger.info("no addresses geocoded in %r" % list_record['title'])
                    continue
            location_name = name
            # block = location['block']
            # location = location['point']
            break

        if location is None:
            self.logger.info("no addresses geocoded in %r" % list_record['title'])
            return

    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  description=description,
                  title=list_record['title'],
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
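
# --- Illustrative sketch, not part of the scraper above ---
# places.place_grabber() returns a list of three-element tuples whose last
# element is the matched place name (the first two appear to be surrounding
# context), which is why the loops unpack (l, r, name). A hypothetical
# debugging helper built on that assumption might look like:

def _print_grabbed_places(text):
    # Print every place name the grabber finds in a piece of text.
    grabber = places.place_grabber()
    for l, r, name in grabber(text):
        print name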
def save(self, old_record, list_record, detail_record):
    # This gets called once all parsing and cleanup is done.
    # It looks a lot like our 'expedient hack' code above.
    # We can ignore detail_record since has_detail is False.
    date = datetime.date(*list_record['updated_parsed'][:3])
    description = text_from_html(list_record['summary'])

    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    # First we'll need some suitable text; throw away HTML tags.
    # full_description = list_record['content'][0]['value']
    # full_description = text_from_html(full_description)
    grabber = places.place_grabber()
    print description + '\n'
    print list_record['summary']
    addrs = grabber(list_record['summary'])
    # printing the article's title for debugging
    # print list_record['title']
    print addrs
    if not addrs:
        addrs = grabber(list_record['title'])
        if not addrs:
            self.logger.info("no addresses found")
            return

    location = None
    location_name = u''
    block = None

    # Ready to geocode. If we had one location_name to try,
    # this could be done automatically in create_or_update(), but
    # we have multiple possible location_names.
    for l, r, name in addrs:
        # addr = addr.strip()
        # aPlace = Place.objects.get(pretty_name = name)
        try:
            aPlace = Place.objects.get(pretty_name=name)
            location = aPlace.location
        except Place.DoesNotExist:
            try:
                newslocation = PlaceSynonym.objects.get(pretty_name=name).place.location
            except PlaceSynonym.DoesNotExist:
                # Neither a Place nor a PlaceSynonym matched; try the next result.
                continue
            print newslocation
            location = newslocation
        # except GeocodingException:
        #     log_exception(level=logging.DEBUG)
        #     continue
        location_name = name
        # block = location['block']
        # location = location['point']
        break

    if location is None:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return

    print location
    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  description=description,
                  title=list_record['title'],
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
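
# --- Illustrative sketch, not part of the scrapers above ---
# Both save() variants repeat the same lookup: try a Place by pretty_name,
# then fall back to a PlaceSynonym. Factored into a helper (the name and
# placement here are assumptions, not from the original code), that pattern
# might read:

def _location_for_place_name(name):
    # Return the geometry for a grabbed place name, or None if the name
    # matches neither a Place nor a PlaceSynonym.
    try:
        return Place.objects.get(pretty_name=name).location
    except Place.DoesNotExist:
        try:
            return PlaceSynonym.objects.get(pretty_name=name).place.location
        except PlaceSynonym.DoesNotExist:
            return None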
def update(self):
    #
    # Download the Calendar RSS feed and update the database.
    #
    logger.info("Starting KSUStudentProgrammingScraper")

    feed = feedparser.parse(self.url)
    seencount = addcount = updatecount = 0
    for entry in feed.entries:
        seencount += 1
        title = convert_entities(entry.title)
        title = foo(title, '', ' (')
        try:
            item = NewsItem.objects.get(title=title, schema__id=self.schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
            continue

        try:
            #
            # The actual RSS feed elements are grabbed here.
            #
            itm_description = entry.description

            # The location text appears between the closing table tag and the
            # first double line break.
            soup = BeautifulSoup(foo(itm_description, "</table><br />", "<br /><br />"))
            locations = soup.findAll(text=True)
            location = locations[0].strip()
            place_grabber = places.place_grabber()
            grab_results = place_grabber(location)
            try:
                place = Place.objects.get(pretty_name=grab_results[0][2])
                item.location = place.location
                item.location_name = place.pretty_name
            except Place.DoesNotExist:
                place = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place
                item.location = place.location
                item.location_name = place.pretty_name
            try:
                item.attributes['room'] = locations[1].strip().replace("Room: ", "")
            except Exception as e:
                logger.info("Tried saving item.room, error: %s" % e)

            item.schema = self.schema
            item.title = title

            soup = BeautifulSoup(foo(itm_description, "<br /><br />", "</td></tr>"))
            item.description = soup.findAll(text=True)
            item.description = item.description[0].strip()

            item.url = entry.link

            start_t = foo(itm_description, "Start Time:</b> </td><td>", "</td>")
            start_t = dateutil.parser.parse(start_t)
            end_t = foo(itm_description, "End Time:</b> </td><td>", "</td>")
            end_t = dateutil.parser.parse(end_t)
            end_dt = foo(itm_description, "End Date:</b> </td><td>", "</td>")
            end_dt = dateutil.parser.parse(end_dt)

            item.item_date = dateutil.parser.parse(entry.category)
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.attributes['start-time'] = start_t.time()
            item.attributes['end-time'] = end_t.time()
            item.save()

            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception as e:
            logger.exception("unexpected error: %s" % e)

    logger.info("KSUStudentProgrammingScraper finished: %d added, %d updated of %s total" %
                (addcount, updatecount, seencount))
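
# --- Illustrative sketch, not part of the scraper above ---
# foo() is not defined in this listing. Judging from the call sites, it pulls
# the text between two delimiters out of a string, with an empty start
# delimiter meaning "from the beginning of the string". A minimal stand-in
# with that assumed behaviour (the real helper may differ) could be:

def foo(text, start, end):
    # Return the substring of text between start and end; if end is not
    # found, return everything from start onward.
    begin = text.find(start) + len(start) if start else 0
    stop = text.find(end, begin)
    return text[begin:stop] if stop != -1 else text[begin:]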