Example #1
 def normalize(self):
     """Checks that all attribute values are in order so that the entry 
     can be used for output, particularly in an Atom feed."""
     if self.title is None:
         self.title = ''
     else:
         self.title = self.title.strip()
     if self.summary is None:
         self.summary = ''
     else:
         self.summary = self.summary.strip()
     if self.content is None or self.content == '':
         self.content = self.summary
     else:
         self.content = self.content.strip()
     if self.published is None:
         self.published = self.date
     if self.published_parsed is None:
         self.published_parsed = parse_date(self.published)
     if self.created is None:
         self.created = self.date
     if self.created_parsed is None:
         self.created_parsed = parse_date(self.created)
     if self.updated is None:
         self.updated = self.date
     if self.updated_parsed is None:
         self.updated_parsed = parse_date(self.updated)
     self.date_atom = time.strftime(config.ATOM_TIME_FORMAT, self.date_parsed)
     self.published_atom = time.strftime(config.ATOM_TIME_FORMAT, self.published_parsed)
     self.created_atom = time.strftime(config.ATOM_TIME_FORMAT, self.created_parsed)
     self.updated_atom = time.strftime(config.ATOM_TIME_FORMAT, self.updated_parsed)            
     self.date_formatted = time.strftime(config.HTML_TIME_FORMAT, self.date_parsed)
     self.published_formatted = time.strftime(config.HTML_TIME_FORMAT, self.published_parsed)
     self.created_formatted = time.strftime(config.HTML_TIME_FORMAT, self.created_parsed)
     self.updated_formatted = time.strftime(config.HTML_TIME_FORMAT, self.updated_parsed)
     # Build GUID
     if self.id is None:
         self.id = self.get_tag_uri(self.date_parsed, self.url)
     # Truncate content for main page
     if publish.shorten.wc(self.content) > config.WORD_LIMIT:
         self.content_abridged = publish.shorten.shorten(self.content, config.WORD_LIMIT)
     else:
         self.content_abridged = self.content
     # Sanitize content
     self.title = publish.sanitizer.sanitize(self.title)
     self.summary = publish.sanitizer.strip(self.summary)
     #self.summary = publish.sanitizer.sanitize(self.summary)
     # If the entry is a photo, allow <img> tag
     if self.type == 'photo':
         self.content = publish.sanitizer.sanitize(self.content, additional_tags=[ 'img' ])
     elif self.type == 'quote':
         self.content = publish.sanitizer.block_to_break(self.content)
         self.content = publish.sanitizer.sanitize(self.content, additional_tags = [ 'br' ])
     else:
         self.content = publish.sanitizer.heading_to_bold(self.content)
         self.content = publish.sanitizer.sanitize(self.content, additional_tags = [ 'p', 'br', 'blockquote' ])
     self.content_abridged = publish.sanitizer.heading_to_bold(self.content_abridged)
     self.content_abridged = publish.sanitizer.sanitize(self.content_abridged, additional_tags = [ 'p', 'br', 'blockquote' ])
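A minimal usage sketch (hypothetical; assumes an Entry class exposing the method above, with config and parse_date importable as in the snippet):

# Hypothetical usage: fill in an entry's missing fields before Atom output.
entry = Entry()
entry.title = '  Hello, world  '
entry.date = 'Mon, 06 Sep 2010 16:45:00 GMT'
entry.date_parsed = parse_date(entry.date)
entry.url = 'http://example.com/hello'
entry.normalize()
# title is stripped, summary/content default to '', and published/created/
# updated (plus their *_parsed, *_atom, *_formatted forms) fall back to date.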
Example #2
def twentyFourHourFrequency():
	#This function works out the hourly frequency of entries of passed legislation in the last 24hrs of legislative activity,
	#using http://www.legislation.gov.uk/new/data.feed
	#It does this by counting entries of passed legislation in the last 24hr period,
	#except on a Monday, when it counts entries back to and including Friday (72hrs).
	#Friday is used because no legislation is passed on a weekend, so Friday falls within the last 24hrs of legislative activity.
	#On a weekend the hourly frequency will always be 0.
	feed = feedparser.parse('http://www.legislation.gov.uk/new/data.feed')
	entries_in_collection_period = 0
	timestamp_currentTime = calendar.timegm(time.gmtime())
	current_weekday = calendar.day_name[date.today().weekday()] #get current weekday name eg Monday
	for entry in feed.entries: #loop through all the entries
		#parse_date returns a UTC struct_time, so convert with calendar.timegm
		#(time.mktime would wrongly apply the local timezone)
		timestamp_entryDate = calendar.timegm(parse_date(entry.published))
		hours_old = (timestamp_currentTime - timestamp_entryDate) / 3600.0
		if current_weekday == 'Monday' and hours_old < 72: #if it's Monday, count entries back to Friday (72hrs)
			entries_in_collection_period += 1
		elif hours_old < 24: #otherwise count entries from the last 24hrs
			entries_in_collection_period += 1
	hourly_frequency = float(entries_in_collection_period)/24 #calc the hourly frequency of entries
	msg = ['\n', str(feed.updated), ' :atom feed update\n', str(entries_in_collection_period),
	       ' entries in the collection period\n', 'Legislation was passed at a rate of ',
	       str(hourly_frequency), ' pieces per hour in the 24hrs before the atom update time\n======']
	writeTofile('output.txt', "".join(msg))
	print "".join(msg)
	return hourly_frequency
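A hypothetical driver for the function above (writeTofile is assumed to append a string to the named file, as in the snippet):

# Run the frequency check and report the result.
if __name__ == '__main__':
	rate = twentyFourHourFrequency()
	print('entries per hour over the collection period: %.3f' % rate)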
Example #3
def rfc3339_to_datetime(dt):
	"""
	Return the RFC 3339 datetime string in `dt` as timezone-aware Django
	datetime object.
	"""
	# parse_date returns a UTC struct_time, so convert with calendar.timegm and
	# utcfromtimestamp; time.mktime would wrongly apply the local timezone here
	# (assumes 'import calendar' alongside the snippet's other imports).
	utc_dt = pytz.utc.localize(datetime.datetime.utcfromtimestamp(calendar.timegm(parse_date(dt))))
	time_zone = pytz.timezone(settings.TIME_ZONE)
	return utc_dt.astimezone(time_zone)
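A quick usage sketch (hypothetical timestamp; assumes Django's settings.TIME_ZONE is configured, e.g. 'Europe/London'):

# parse_date accepts RFC 3339 strings, so a feed entry's 'updated' value works directly.
local_dt = rfc3339_to_datetime('2013-05-07T09:30:00Z')
print(local_dt)  # 2013-05-07 10:30:00+01:00 for Europe/London (BST)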
Example #4
 def normalize(self):
     """Checks that all attribute values are in order so that the source 
     can be used in an Atom feed."""
     if self.id is None:
         # TODO: Generate unique ID (maybe unnecessary)
         pass
     if self.entries:
         if self.updated is None:
             self.updated = self.entries[0].updated
         if self.updated_parsed is None:
             self.updated_parsed = parse_date(self.updated)
Example #5
 def parse(self):
     """Fetches Twitter tweets using the Twitter API."""
     self.logger.debug("Contacting Twitter services")
     try:
         twitty = twitter.Api(
             username=config.TWITTER_ACCOUNT, 
             password=config.TWITTER_PASSWORD
         )
         self.logger.info("Getting Twitter tweets for %s" % self.owner)
         tweets = twitty.GetUserTimeline(self.name)
         for tweet in tweets:
             skip = False
             e = Quote()
             e.source.name = self.name
             e.source.url = self.url
             e.author = tweet.user.name
             e.title = "Tweet from %s" % e.author
             e.summary = tweet.text
             e.content = e.summary
             # Add hyperlinks
             # e.content = TwitterStatus.link_users(e.content)
             # e.content = TwitterStatus.link_hashtags(e.content)
             e.citation = e.author
             self.logger.info("Tweet: '%s'" % e.summary)
             e.url = self.get_tweet_url(tweet.id)
             self.logger.debug("Tweet URL: %s" % e.url)
             e.date = tweet.created_at
             e.date_parsed = parse_date(e.date)
             self.logger.debug("Tweet date: %s" % e.date_as_string(e.date_parsed))
             # Skip this tweet if replies are turned off
             if self.ignore_replies and TwitterStatus.is_reply(e.summary):
                 skip = True
             # Skip this tweet if retweets are turned off
             if self.ignore_retweets and TwitterStatus.is_retweet(e.summary):
                 skip = True
             # Skip this tweet if it's in the exclusion list
             if self.excluded_keywords is not None:
                 for keyword in self.excluded_keywords:
                     self.logger.debug("Checking for excluded keyword: '%s'" % keyword)
                     if e.summary.lower().find(keyword.lower()) > -1:
                         self.logger.debug("Skipping tweet with excluded keyword: '%s'" % keyword)
                         skip = True
             if skip:
                 continue
             else:
                 self.entries.append(e)
     except BadStatusLine:
         self.logger.exception("Twitter.com unexpectedly closed the connection!")
     except HTTPError as err:
         self.logger.exception("HTTP error: '%s'" % err)
Example #6
 def parse_rfc3339_date(self, dateString):
     # Reinterpret the trailing 'Z' (UTC) as a fixed -02:00 offset; parse_date
     # normalizes to UTC, which shifts the parsed hour forward by two hours.
     mydt = parse_date(dateString.replace('Z', '-02:00'))
     weekdayId = mydt.tm_wday
     hour = mydt.tm_hour
     if 8 <= hour <= 17:
         time_of_day = 'Morning'
     elif hour > 17 and hour < 22:
         time_of_day = 'Evening'
     else:
         time_of_day = 'Night'
     retobj = {
         'weekdayId': weekdayId,
         'weekdayName': self.enum_weekdays(weekdayId),
         'hour': hour,
         'time_of_day': time_of_day,
     }
     return retobj
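A hypothetical call (obj stands in for an instance of the surrounding class; enum_weekdays is assumed to map 0..6 to weekday names):

# Classify a feed timestamp by weekday and time of day.
info = obj.parse_rfc3339_date('2014-05-06T19:30:00Z')
# The 'Z' -> '-02:00' swap shifts the hour to 21, so the result is:
# {'weekdayId': 1, 'weekdayName': 'Tuesday', 'hour': 21, 'time_of_day': 'Evening'}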
Example #8
def frequencyOfLegislation():
	feed = feedparser.parse('http://www.legislation.gov.uk/new/data.feed')
	print '\natom feed: http://www.legislation.gov.uk/new/data.feed'
	print 'this atom feed was updated at', feed.entries[0].updated, '\n'
	storeTheDay = []
	for entry in feed.entries: #loop through all the entries
		print entry.title
		fullDateTuple = tuple(parse_date(entry.published)) #get the published date and change to tuple format
		print 'Day: %dth' % fullDateTuple[2] #print just the day of the month
		storeTheDay.append(fullDateTuple[2]) #add each day to the storeTheDay list - this relies on the date format not changing - not good!

	frequency = dict((i, storeTheDay.count(i)) for i in storeTheDay) #map each day to how many entries were published on it
	now = datetime.datetime.now()
	for key in frequency:
		print '\nOn the %dth there were %d pieces of legislation' % (key, frequency[key])
		print 'That is %.2f pieces of legislation per hour\n' % (frequency[key] / 24.0)
		if key == now.day: #compare day-of-month numbers (the original compared an int against a string)
			print 'Today, right now, legislation is being passed at %.2f pieces per hour\n' % (frequency[key] / 24.0)
Example #9
 def parse(self):
     """Fetches Tumblr API data and parses it."""
     self.logger.info("Fetching API data at '%s'" % self.api_url)
     self.http_response, self.http_content = spider.fetch(self.api_url)
     self.logger.info("Parsing API data for entries...")
     t = tumblr.parse(self.api_url)
     for post in t.posts:
         try:
             if post.type == 'regular':
                 self.logger.info("Tumblr post type: regular")
                 e = Post()
                 e.title = post.title
                 e.summary = post.content
                 e.content = post.content
             elif post.type == 'link':
                 if 'link' in self.excluded_types:
                     self.logger.debug("Skipping Tumblr link")
                     continue
                 else:
                     self.logger.info("Tumblr post type: link")
                     e = Link()
                     e.title = post.title
                     e.summary = post.content
                     e.content = post.content
                     e.url = post.related
                     e.comments = post.url
             elif post.type == 'quote':
                 self.logger.info("Tumblr post type: quote")
                 e = Quote()
                 e.summary = post.content
          # Chop the smart quotes that Tumblr automatically
          # adds to a quote.  (lstrip/rstrip treat their argument as a
          # character set, not a substring, so strip explicitly.)
          if e.summary.startswith("&#8220;"):
              e.summary = e.summary[len("&#8220;"):]
          if e.summary.endswith("&#8221;"):
              e.summary = e.summary[:-len("&#8221;")]
                 e.content = e.summary
                 # Get the quote's citation, and, if possible its source
                 e.citation = post.source
                 try:
                     soup = BeautifulSoup(e.citation)
                     e.citation_url = soup.find('a').get('href')
                     e.via = e.citation_url
                 except AttributeError:
                     e.citation_url = None
             elif post.type == 'photo':
                 self.logger.info("Tumblr post type: photo")
                 e = Photo()
                 e.photo_type = 'tumblr'
                 e.title = ''
                 e.summary = post.caption
                 #e.content = e.summary
                 # post.urls is a dictionary of photo URLs keyed by size.
                 # Let's get the big one.
                 e.photo_url = post.urls['500']
                 e.cached_url = config.IMAGES_URL + '/' + e._get_cached_original_shortname()
                 self.logger.debug("Tumblr photo URL: '%s'" % e.photo_url)
                 e.cache()
                 e.set_dimensions()
                 e.set_content()
             # Conversation, Video, and Audio post types aren't 
             # going to be implemented for a while
             elif post.type == 'conversation':
                 # TODO: Support Tumblr conversations
                 self.logger.info("Tumblr post type: conversation")
                 continue
                 #e = Conversation()
             elif post.type == 'video':
                 # TODO: Support Tumblr videos
                 self.logger.info("Tumblr post type: video")
                 continue
                 #e = Video()
      elif post.type == 'audio':
          # TODO: Support Tumblr audio
          self.logger.info("Tumblr post type: audio")
          continue
          #e = Audio()
      else:
          # Unknown post type: skip it so a stale 'e' from a previous
          # iteration isn't appended again below.
          self.logger.info("Skipping unknown Tumblr post type: %s" % post.type)
          continue
             e.source.name = self.name
             e.source.url = self.url
             if e.url == '':
                 e.url = post.url
             e.author = self.owner
             e.date = post.date
             e.date_parsed = parse_date(post.date)
             self.logger.debug("Tumblr post date: %s" % e.date_as_string(e.date_parsed))
             self.logger.info("Entry title: '%s'" % e.title)
             self.logger.debug("Entry URL: '%s'" % e.url)
             self.entries.append(e)
         except AttributeError:
             # FIXME: Why is this exception handler here???
             pass
Example #10
def getnewbooks():
    """Routine to query cps to get the latest books added. return array of new books

    """
    logger.info('Getting new books from server')
    d = feedparser.parse('http://' + config.settings['username'] + ':' +
                         config.settings['password'] + '@' +
                         config.settings['serveraddress'])

    if d.bozo == 1:
        logger.error('Username, password, or Server Address url is incorrect.')
        return False
    else:
        logger.info('Name of the feed:' + d.feed.title)
        logger.info('Looking for books uploaded in the last: ' +
                    str(config.settings['numofdaysfornotification']) +
                    ' days.')

    _thumbnail_uri = u'http://opds-spec.org/image/thumbnail'
    recent_books = []

    if d.status == 200:
        for book in d.entries:
            # parse_date returns UTC, so compare with utcnow()/timegm; mktime()
            # would have applied the local timezone (assumes 'from calendar
            # import timegm' alongside the snippet's other imports).
            dt = datetime.utcfromtimestamp(timegm(parse_date(book.updated)))
            if datetime.utcnow() - dt < timedelta(
                    days=int(config.settings['numofdaysfornotification'])):
                if 'title' in book:
                    logger.info('Found book. Title: ' + book.title)
                else:
                    logger.info('Found book. Strange, no Title field!')

                # Need to get a uniqueID for naming the CIDs in the html newsletter - pulling the GUID from the link url
                # While we are here, might as well find out if the book has a cover
                #  if the book has a cover set:
                #        the book_cover_id to the GUID
                #        pull the url to make it easier to get to
                #        go get the cover and resize it
                #  if the book doesn't have a cover, set book_cover_id to 'Unknown.png'
                # Also set up the book_location - we'll use that in the newsletter to point to the book
                for _entry in book.links:
                    if _entry.rel == _thumbnail_uri:

                        try:
                            book['book_location'] = config.settings['serverbookurl'] + _entry.href.rsplit('/', 1)[1]
                            book_cover_id = book.link.rsplit('/', 1)[1]
                            book["book_cover_id"] = book_cover_id
                            book["cover_thumbnail"] = get_thumbnail(_entry.href)
                            logger.debug('    Book has cover.')
                        except Exception:
                            logger.debug('    Error in getting book cover.')
                            book["book_cover_id"] = "Unknown.png"
                            book['book_location'] = "#"

                    if book.get('book_cover_id', 'nope') == 'nope':
                        logger.debug('    Book has no cover.')
                        book["book_cover_id"] = "Unknown.png"

                # The book summaries posted with OPDS feeds can be long.
                # Check the size and, if it's beyond the set limit, shorten it.
                try:
                    if len(book['summary']) >= config.settings['SUMMARY_LENGTH']:
                        book['short_summary'] = book['summary'][:config.settings['SUMMARY_LENGTH']] + "...see site..."
                        logger.debug('    Book summary too long. Being shortened.')
                    elif len(book['summary']) == 0:
                        book['short_summary'] = 'No summary information.'
                        logger.debug('    Book summary does not exist.')
                    else:
                        book['short_summary'] = book["summary"]
                        logger.debug('    Book summary within the length limit.')

                except Exception:
                    book['short_summary'] = 'No summary information.'

                # add newly added book to array
                recent_books.append(book)

        return recent_books

    else:
        logger.error(
            'Error getting opds feed! - Please check config. Status Code: ' +
            str(d.status))
        return False
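A hypothetical caller (config.settings, get_thumbnail, and logger come from the surrounding module):

# Fetch recently added books and print their titles.
books = getnewbooks()
if books:
    for book in books:
        print(book.get('title', 'untitled'))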
Example #11
def s2ts(s):
    """Convert a feed date string to an integer Unix timestamp (UTC)."""
    from calendar import timegm
    from feedparser import _parse_date as parse_date
    return int(timegm(parse_date(s)))
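A quick sketch of what s2ts accepts (feedparser's _parse_date handles both RFC 822 and RFC 3339 date strings):

# Both common feed date formats map to the same UTC timestamp.
print(s2ts('Sat, 01 Jan 2011 00:00:00 GMT'))  # 1293840000
print(s2ts('2011-01-01T00:00:00Z'))           # 1293840000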
Example #12
 def parse(self):
     """Gets recent photos from a photostream, caches and thumbnails them."""
     self.logger.debug("Contacting Flickr Services")
     # Using flickrapi's 'etree' options requires ElementTree, 
     # which is standard with Python 2.5, but a separate install with 
     # Python 2.4.  The flickrapi module must also be patched 
     # using 'patches/flickrapi.patch' when using Python 2.4.
     try:
         flickr = flickrapi.FlickrAPI(config.FLICKR_KEY, format='etree')
         extras = 'date_upload,date_taken,last_update,owner_name,media,tags,license'
         self.logger.info("Getting photos for %s" % self.owner)
         photos = flickr.people_getPublicPhotos(
                     user_id=self.flickr_id, 
                     safe_search=SAFESEARCH_RESTRICTED, 
                     extras=extras
         )
         for photo in photos:
             e = Photo()
             e.photo_type = 'flickr'
             e.source.name = self.name
             e.source.url = self.url
             # This only gets the most recent photo, which is really 
             # a bug, but I like this behavior.  Too many photos 
             # clutter things up.
             p = photo.find('photo')
             #if p.get('media') == 'video':
             #    self.logger.info("Skipping Flickr video")
             #    continue
             e.title = p.get('title', 'untitled')
             if e.title.strip() == '':
                 e.title = 'untitled'
             self.logger.info("Photo title: '%s'" % e.title)
             e.photo_id = p.get('id')
             e.farm_id = p.get('farm')
             e.secret = p.get('secret')
             e.server = p.get('server')
             e.photo_url = e._get_flickr_photo_url(
                 e.farm_id, 
                 e.server, 
                 e.photo_id, 
                 e.secret
             )
             self.logger.debug("Photo image URL: '%s'" % e.photo_url)
             e.url = e._get_flickr_url(self.flickr_id, e.photo_id)
             e.cached_url = config.IMAGES_URL + '/' + e._get_cached_original_shortname()
             self.logger.debug("Photo Flickr page URL: '%s'" % e.url)
             e.cache()
             e.set_dimensions()
             e.date = p.get('dateupload')
             e.date_parsed = datetime.datetime.utcfromtimestamp(float(e.date)).timetuple()
             e.published = e.date
             e.published_parsed = e.date_parsed
             e.created = p.get('datetaken', e.date)
             if e.created == e.date:
                 e.created_parsed = e.date_parsed
             else:
                 e.created_parsed = parse_date(e.created)
             e.updated = p.get('lastupdate', e.date)
             e.updated_parsed = datetime.datetime.utcfromtimestamp(float(e.updated)).timetuple()
             # Okay, now get the detailed photo info
             self.logger.debug("Making photos.getInfo API call...")
             photo_info = flickr.photos_getInfo(photo_id=e.photo_id, secret=e.secret)
             e.summary = photo_info.find('photo').find('description').text
             if e.summary is None:
                 e.summary = ''
             e.author = photo_info.find('photo').find('owner').get('realname')
             if e.author == '':
                 e.author = self.owner
             e.set_content()
             self.entries.append(e)
     except FlickrError as err:
         self.logger.exception("Flickr API error: '%s'" % err)