def expungeCache():
    """ Expunge old entries from a cache of entries """
    log = planet.logger

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'):
            continue
        if config.feed_options(sub).has_key('cache_keep_entries'):
            entry_count[data.feed.id] = int(
                config.feed_options(sub)['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    for mtime, file in dir:
        try:
            entry = minidom.parse(file)

            # determine source of entry
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if not sources:
                # no source determined, do not delete
                log.debug("No source found for %s", file)
                continue
            ids = sources[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", file)
                continue
            if ids[0].childNodes[0].nodeValue in entry_count:
                # subscribed to feed, update entry count
                entry_count[ids[0].childNodes[0].nodeValue] = entry_count[
                    ids[0].childNodes[0].nodeValue] - 1
                if entry_count[ids[0].childNodes[0].nodeValue] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s",
                        file, ids[0].childNodes[0].nodeValue)
                    continue
                else:
                    # maximum reached
                    log.debug("Removing %s, maximum reached for %s",
                        file, ids[0].childNodes[0].nodeValue)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s",
                    file, ids[0].childNodes[0].nodeValue)

            # remove old entry
            os.unlink(file)
        except:
            log.error("Error parsing %s", file)
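# Minimal usage sketch (hypothetical driver, not part of the original file):
# expungeCache() assumes the planet configuration has already been loaded and
# planet.logger has been initialised, since it reads config.cache_directory(),
# config.subscriptions() and the per-feed cache_keep_entries options.  The
# 'planet.ini' path and the module paths in the imports are assumed examples.
#
#     import planet
#     from planet import config
#     from planet.splice import expungeCache
#
#     config.load('planet.ini')
#     planet.getLogger(config.log_level(), config.log_format())
#     expungeCache()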
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1

                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
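# For illustration only: with assumed example configuration values, the feed
# header assembled by the "insert feed information" block above serializes
# roughly as follows (every value shown here is a made-up placeholder):
#
#     <feed xmlns="http://www.w3.org/2005/Atom" xmlns:planet="...">
#       <title>Example Planet</title>
#       <updated>2024-01-01T00:00:00Z</updated>
#       <generator uri="http://example.org/venus/">Venus</generator>
#       <author>
#         <name>Jane Doe</name>
#         <email>jane@example.org</email>
#       </author>
#       <id>http://planet.example.org/atom.xml</id>
#       <link rel="self" href="http://planet.example.org/atom.xml"
#             type="application/atom+xml"/>
#       <link rel="alternate" href="http://planet.example.org/"/>
#       <planet:source>...</planet:source>
#       <entry>...</entry>
#     </feed>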
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1

                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()

    posted_urls = set()
    if config.post_to_twitter():
        if os.path.exists(posted_urls_file):
            try:
                with open(posted_urls_file, 'rb') as f:
                    posted_urls = pickle.load(f)
            except Exception as ex:
                log.error("Error reading posted_urls %s", ex)
        # print(posted_urls)

    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1

                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # Twitter integration
            if config.post_to_twitter():
                url = None
                twitter = None
                title = "Untitled post..."

                links = entry.getElementsByTagName('link')
                if links:
                    for link in links:
                        if (link.hasAttribute('rel') and
                                link.hasAttribute('type') and
                                link.hasAttribute('href')):
                            if (link.getAttribute('rel') == 'alternate' and
                                    link.getAttribute('type') == 'text/html'):
                                url = link.getAttribute('href')
                                break

                titles = entry.getElementsByTagName('title')
                if titles:
                    title = unicode(
                        titles[0].firstChild.nodeValue.encode('utf-8'),
                        'utf-8').strip()

                handles = entry.getElementsByTagName('planet:twitter')
                if (handles):
                    twitter = unicode(
                        handles[0].firstChild.nodeValue.encode('utf-8'),
                        "utf-8")

                if url is not None and url not in posted_urls:
                    # log.debug("Going to post URL to Twitter: twitter='{}' title='{}', url='{}'".format(twitter, title, url))
                    txt_append = u''
                    if twitter:
                        txt_append = u" (by @" + twitter.encode('utf-8').strip() + u")"
                    max_title_len = 280 - 20 - len(txt_append)
                    if (len(title) > max_title_len):
                        title = title[:max_title_len]
                    txt = title + txt_append + u"\n" + url

                    log.debug(u"Text to post '{}'".format(txt))
                    try:
                        posted_urls.add(url)
                        config.twitter_api.update_status(txt)
                    except Exception as ex:
                        log.error(u"Error posting to Twitter: %s", ex)

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except Exception as ex:
            log.error("Error parsing %s: %s", file, ex)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)

    if config.post_to_twitter():
        with open(posted_urls_file, 'wb') as f:
            pickle.dump(posted_urls, f, protocol=pickle.HIGHEST_PROTOCOL)

    if index:
        index.close()

    return doc
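# For illustration only (all values below are assumed examples): when
# post_to_twitter is enabled, the status text assembled above takes the form
#
#     <title, truncated to 280 - 20 - len(suffix) characters><suffix>
#     <entry URL>
#
# where the suffix is " (by @handle)" if the entry carries a planet:twitter
# element, e.g.
#
#     Some post title (by @example)
#     http://blog.example.org/some-post
#
# URLs that have been posted once are remembered in the pickled posted_urls
# set, so later runs of splice() do not tweet the same entry again.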
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.getLogger(config.log_level(), config.log_format())

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
                    ids = sources[0].getElementsByTagName('planet:id')
                    if not ids:
                        continue
                    if ids[0].childNodes[0].nodeValue not in sub_ids:
                        continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
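# Minimal usage sketch (hypothetical, for illustration): splice() returns an
# xml.dom.minidom Document holding the merged Atom feed, which a caller can
# serialize before the templating step.  The config path, output filename and
# module paths in the imports are assumed examples, not taken from this file.
#
#     import planet
#     from planet import config
#     from planet.splice import splice
#
#     config.load('planet.ini')
#     doc = splice()
#     output = open('planet.atom', 'w')
#     output.write(doc.toxml('utf-8'))
#     output.close()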