# Example #1
# 0
    def save(self, old_record, list_record, detail_record):
        """Persist one parsed feed entry as a news item.

        Called once all parsing and cleanup is done.  ``detail_record``
        is ignored because ``has_detail`` is False for this scraper.
        """
        date = datetime.date(*list_record['updated_parsed'][:3])
        description = text_from_html(list_record['summary'])

        # This feed provides no geographic data, so scan the entry text
        # for known place names instead.
        grabber = places.place_grabber()
        addrs = grabber(description)

        location = None
        location_name = u''

        if not addrs:
            # No place name matched: fall back to the campus-wide default.
            location_name = "Kent State"
            locationSyn = LocationSynonym.objects.get(pretty_name=location_name)
            location = Location.objects.get(name=locationSyn.location).location
            self.logger.info("no matches for place found. Using Kent State default")
        else:
            # Check each grabber hit against the database -- Places first,
            # then PlaceSynonyms -- and stop at the first match.
            for l, r, name in addrs:
                try:
                    place = Place.objects.get(pretty_name=name)
                    location = place.location
                except Place.DoesNotExist:
                    try:
                        place = PlaceSynonym.objects.get(pretty_name=name)
                        location = place.place.location
                    except PlaceSynonym.DoesNotExist:
                        self.logger.info("no addresses geocoded in %r" % list_record['title'])
                        continue
                location_name = name
                break
            if location is None:
                self.logger.info("no addresses geocoded in %r" % list_record['title'])
                return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      description=description,
                      title=list_record['title'],
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
# Example #2
# 0
    def save(self, old_record, list_record, detail_record):
        """Persist one parsed feed entry as a news item.

        Called once all parsing and cleanup is done.  ``detail_record``
        is ignored because ``has_detail`` is False for this scraper.
        """
        date = datetime.date(*list_record['updated_parsed'][:3])
        description = text_from_html(list_record['summary'])

        # This feed provides no geographic data; scan the raw summary
        # for place names, then fall back to the entry title.
        grabber = places.place_grabber()
        addrs = grabber(list_record['summary'])
        if not addrs:
            addrs = grabber(list_record['title'])
            if not addrs:
                self.logger.info("no addresses found")
                return

        location = None
        location_name = u''
        # Several candidate names may have been found, so resolve them
        # here rather than in create_or_update(): Places first, then
        # PlaceSynonyms; stop at the first database match.  Both lookups
        # are guarded so a missing synonym no longer raises out of save().
        for l, r, name in addrs:
            try:
                location = Place.objects.get(pretty_name=name).location
            except ObjectDoesNotExist:
                try:
                    location = PlaceSynonym.objects.get(pretty_name=name).place.location
                except ObjectDoesNotExist:
                    continue
            location_name = name
            break
        if location is None:
            self.logger.info("no addresses geocoded in %r" % list_record['title'])
            return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      description=description,
                      title=list_record['title'],
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
# Example #3
# 0
	def update(self):
		"""Download the calendar RSS feed and add/update news items."""
		logger.info("Starting KSUStudentProgrammingScraper")

		feed = feedparser.parse(self.url)
		seencount = addcount = updatecount = 0
		for entry in feed.entries:
			seencount += 1
			title = convert_entities(entry.title)
			title = foo(title, '', ' (')
			# Reuse an existing item with the same title/schema, or
			# create a fresh one.
			try:
				item = NewsItem.objects.get(title=title,
											schema__id=self.schema.id)
				status = "updated"
			except NewsItem.DoesNotExist:
				item = NewsItem()
				status = "added"
			except NewsItem.MultipleObjectsReturned:
				logger.warning("Multiple entries matched title %r, event titles are not unique?" % title)
				continue
			try:
				# The actual RSS feed elements are grabbed here.  The
				# entry body is an HTML table; foo() extracts the text
				# between two markers.
				itm_description = entry.description

				# Location text sits between the closing table tag and
				# the first double line break.
				soup = BeautifulSoup(foo(itm_description,"</table><br />","<br /><br />"))
				locations = soup.findAll(text=True)
				location = locations[0].strip()
				place_grabber = places.place_grabber()
				grab_results = place_grabber(location)
				try:
					# Try a canonical Place first; fetch the row once
					# instead of issuing two identical queries.
					place = Place.objects.get(pretty_name=grab_results[0][2])
					item.location = place.location
					item.location_name = place.pretty_name
				except Place.DoesNotExist:
					# Fall back to a synonym of the place name.  Any
					# other failure propagates to the outer handler.
					synonym = PlaceSynonym.objects.get(pretty_name=grab_results[0][2])
					item.location = synonym.place.location
					item.location_name = synonym.place.pretty_name

				# Room number is optional; best-effort only.
				try:
					item.attributes['room'] = locations[1].strip().replace("Room: ","")
				except Exception as e:
					logger.info("Tried saving item.room, error: %s" % e)

				item.schema = self.schema
				item.title = title

				# Description text sits between the first double line
				# break and the closing table cell.
				soup = BeautifulSoup(foo(itm_description,"<br /><br />","</td></tr>"))
				item.description = soup.findAll(text=True)[0].strip()

				item.url = entry.link

				start_t = dateutil.parser.parse(
					foo(itm_description,"Start Time:</b>&nbsp;</td><td>","</td>"))
				end_t = dateutil.parser.parse(
					foo(itm_description,"End Time:</b>&nbsp;</td><td>","</td>"))

				item.item_date = dateutil.parser.parse(entry.category)
				item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

				item.attributes['start-time'] = start_t.time()
				item.attributes['end-time'] = end_t.time()

				item.save()

				if status == 'added':
					addcount += 1
				else:
					updatecount += 1
				logger.info("%s: %s" % (status, item.title))
			except Exception as e:
				logger.exception("unexpected error: %s" % e)
		logger.info("KSUStudentProgrammingScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))