def test_unicode(self):
    from planet.spider import filename
    index = idindex.create()
    iri = 'http://www.\xe8\xa9\xb9\xe5\xa7\x86\xe6\x96\xaf.com/'
    index[filename('', iri)] = 'data'
    index[filename('', iri.decode('utf-8'))] = 'data'
    index[filename('', u'1234')] = 'data'
    index.close()
def test_expunge(self):
    config.load(configfile)

    # create test entries in cache with correct timestamps
    for entry in glob.glob(testentries):
        e = minidom.parse(entry)
        e.normalize()
        eid = e.getElementsByTagName('id')
        eupdated = e.getElementsByTagName('updated')
        if not eid or not eupdated: continue
        efile = filename(workdir, eid[0].childNodes[0].nodeValue)
        eupdated = eupdated[0].childNodes[0].nodeValue
        emtime = time.mktime(feedparser._parse_date_w3dtf(eupdated))
        shutil.copyfile(entry, efile)
        os.utime(efile, (emtime, emtime))

    # create test feeds in cache
    sources = config.cache_sources_directory()
    for feed in glob.glob(testfeeds):
        f = minidom.parse(feed)
        f.normalize()
        fid = f.getElementsByTagName('id')
        if not fid: continue
        ffile = filename(sources, fid[0].childNodes[0].nodeValue)
        shutil.copyfile(feed, ffile)

    # verify that exactly nine entries + one source dir were produced
    files = glob.glob(workdir + "/*")
    self.assertEqual(10, len(files))

    # verify that exactly four feeds were produced in source dir
    files = glob.glob(sources + "/*")
    self.assertEqual(4, len(files))

    # expunge...
    expungeCache()

    # verify that five entries and one source dir are left
    files = glob.glob(workdir + "/*")
    self.assertEqual(6, len(files))

    # verify that the right five entries are left
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)
def create():
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index): os.makedirs(index)
    import anydbm
    index = anydbm.open(filename(index, 'id'), 'c')

    try:
        import libxml2
    except:
        libxml2 = False
        from xml.dom import minidom

    for file in glob(cache + "/*"):
        if os.path.isdir(file):
            continue
        elif libxml2:
            try:
                doc = libxml2.parseFile(file)
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                if entry and source:
                    index[filename('', entry[0].content)] = source[0].content
                doc.freeDoc()
            except:
                log.error(file)
        else:
            try:
                doc = minidom.parse(file)
                doc.normalize()
                ids = doc.getElementsByTagName('id')
                entry = [e for e in ids if e.parentNode.nodeName == 'entry']
                source = [e for e in ids if e.parentNode.nodeName == 'source']
                if entry and source:
                    index[filename('', entry[0].childNodes[0].nodeValue)] = \
                        source[0].childNodes[0].nodeValue
                doc.unlink()  # minidom documents are released with unlink()
            except:
                log.error(file)

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()

    return open()
def create():
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index): os.makedirs(index)
    import dbhash
    index = dbhash.open(filename(index, 'id'), 'c')

    try:
        import libxml2
    except:
        libxml2 = False
        from xml.dom import minidom

    for file in glob(cache + "/*"):
        if os.path.isdir(file):
            continue
        elif libxml2:
            try:
                doc = libxml2.parseFile(file)
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                if entry and source:
                    index[filename('', entry[0].content)] = source[0].content
                doc.freeDoc()
            except:
                log.error(file)
        else:
            try:
                doc = minidom.parse(file)
                doc.normalize()
                ids = doc.getElementsByTagName('id')
                entry = [e for e in ids if e.parentNode.nodeName == 'entry']
                source = [e for e in ids if e.parentNode.nodeName == 'source']
                if entry and source:
                    index[filename('', entry[0].childNodes[0].nodeValue)] = \
                        source[0].childNodes[0].nodeValue
                doc.unlink()  # minidom documents are released with unlink()
            except:
                log.error(file)

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()

    return open()
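A minimal command-line hook is sketched below. It is not part of the module: it only assumes the config.load() helper used elsewhere in this project, and the 'config.ini' fallback and argument handling are assumptions for illustration.

# Sketch only: rebuild the id index for the cache named by a config file.
# The 'config.ini' default is an assumption, not a project convention.
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        config.load(sys.argv[1])
    else:
        config.load('config.ini')
    index = create()            # create() re-opens the index when done
    if index: index.close()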
def test_filename(self):
    self.assertEqual(os.path.join('.', 'example.com,index.html'),
        filename('.', 'http://example.com/index.html'))
    self.assertEqual(os.path.join('.',
        'planet.intertwingly.net,2006,testfeed1,1'),
        filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1'))
    self.assertEqual(os.path.join('.',
        '00000000-0000-0000-0000-000000000000'),
        filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000'))

    # Requires Python 2.3
    try:
        import encodings.idna
    except:
        return
    self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
        filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
def destroy():
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index): return None
    idindex = filename(index, 'id')
    if os.path.exists(idindex): os.unlink(idindex)
    os.removedirs(index)
    log.info(idindex + " deleted")
def destroy():
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index): return None
    idindex = filename(index, 'id')
    if os.path.exists(idindex): os.unlink(idindex)
    os.removedirs(index)
    log.info(idindex + " deleted")
def open():
    try:
        cache = config.cache_directory()
        index = os.path.join(cache, 'index')
        if not os.path.exists(index): return None
        import anydbm
        return anydbm.open(filename(index, 'id'), 'w')
    except Exception, e:
        if e.__class__.__name__ == 'DBError': e = e.args[-1]
        from planet import logger as log
        log.error(str(e))
def open():
    try:
        cache = config.cache_directory()
        index = os.path.join(cache, 'index')
        if not os.path.exists(index): return None
        import dbhash
        return dbhash.open(filename(index, 'id'), 'w')
    except Exception, e:
        if e.__class__.__name__ == 'DBError': e = e.args[-1]
        from planet import logger as log
        log.error(str(e))
def open():
    try:
        cache = config.cache_directory()
        index = os.path.join(cache, "index")
        if not os.path.exists(index): return None
        return dbhash.open(filename(index, "id"), "w")
    except Exception, e:
        if e.__class__.__name__ == "DBError": e = e.args[-1]
        from planet import logger as log
        log.error(str(e))
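A usage sketch of the id index built above: the lookup() helper below is hypothetical, and only assumes that open() and planet.spider.filename behave as in the functions shown here.

# Hypothetical helper: map an entry id to the id of the source feed it was
# indexed under, returning None when there is no index or no match.
def lookup(entry_id):
    from planet.spider import filename
    index = open()                      # None when the cache has no index
    if index is None: return None
    try:
        key = filename('', entry_id)
        if index.has_key(key):
            return index[key]
        return None
    finally:
        index.close()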
def run(script, doc, output_file=None, options={}):
    """ process a Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file:
        # filter
        context.push({'input': XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config, feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data = feedparser.parse(filename(sources, sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name', ''), data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name, value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
            entry.source.config = find_config(config, entry.source)

            # add new_feed and new_date fields
            if 'id' in entry.source:
                entry.new_feed = entry.source.id
            else:
                entry.new_feed = None
            entry.new_date = date = None
            if entry.has_key('published_parsed'): date = entry.published_parsed
            if entry.has_key('updated_parsed'): date = entry.updated_parsed
            if date: entry.new_date = time.strftime(new_date_format, date)

            # remove new_feed and new_date fields if not "new"
            if entry.new_date == last_date:
                entry.new_date = None
                if entry.new_feed == last_feed:
                    entry.new_feed = None
                else:
                    last_feed = entry.new_feed
            elif entry.new_date:
                last_date = entry.new_date
                last_feed = None

            # add streams for all text constructs
            for key in entry.keys():
                if key.endswith("_detail") and entry[key].has_key('type') and \
                    entry[key].has_key('value'):
                    streamify(entry[key], entry.source.planet_bozo)
            if entry.has_key('content'):
                for content in entry.content:
                    streamify(content, entry.source.planet_bozo)

        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet', True))

        context.push(vars)

    # apply template
    output = tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file, 'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
def run(script, doc, output_file=None, options={}):
    """ process a Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file:
        # filter
        context.push({'input': XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config, feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data = feedparser.parse(filename(sources, sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name', ''), data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name, value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
            entry.source.config = find_config(config, entry.source)

            # add new_feed and new_date fields
            entry.new_feed = entry.source.id
            entry.new_date = date = None
            if entry.has_key('published_parsed'): date = entry.published_parsed
            if entry.has_key('updated_parsed'): date = entry.updated_parsed
            if date: entry.new_date = time.strftime(new_date_format, date)

            # remove new_feed and new_date fields if not "new"
            if entry.new_date == last_date:
                entry.new_date = None
                if entry.new_feed == last_feed:
                    entry.new_feed = None
                else:
                    last_feed = entry.new_feed
            elif entry.new_date:
                last_date = entry.new_date
                last_feed = None

            # add streams for all text constructs
            for key in entry.keys():
                if key.endswith("_detail") and entry[key].has_key('type') and \
                    entry[key].has_key('value'):
                    streamify(entry[key], entry.source.planet_bozo)
            if entry.has_key('content'):
                for content in entry.content:
                    streamify(content, entry.source.planet_bozo)

        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet', True))

        context.push(vars)

    # apply template
    output = tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file, 'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
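As a rough usage sketch, run() can be invoked either as a template (writing a file) or as a filter (returning the result). Every path and file name below is a placeholder, not a name from the project.

# Hypothetical invocations; all paths here are illustrative only.
doc = open('cache/digest.xml').read()

# as a template: render the aggregated feed to an output file
run('themes/mytheme/index.html.genshi', doc, 'output/index.html')

# as a filter: transform the document and get the result back
result = run('filters/tweak.genshi', doc)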
# Create the blacklist dir if it does not exist
if not os.path.exists(blacklist):
    os.mkdir(blacklist)
    print "<p>Created directory %s</p>" % blacklist

# find list of urls, in the form bl[n]=url
for key in form.keys():
    if not key.startswith("bl"): continue
    url = unquote(form[key].value)

    # find corresponding files
    cache_file = filename(cache, url)
    blacklist_file = filename(blacklist, url)

    # move to blacklist if found
    if os.path.exists(cache_file):
        os.rename(cache_file, blacklist_file)
        print "<p>Blacklisted <a href='%s'>%s</a></p>" % (url, url)
    else:
        print "<p>Unknown file: %s</p>" % cache_file

print """
<p>Note that blacklisting does not automatically
def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    from planet import logger
    import config
    try:
        import urllib2, StringIO
        from planet.spider import filename

        # list cache file name
        cache_filename = filename(config.cache_lists_directory(), list)

        # retrieve list options (e.g., etag, last-modified) from cache
        options = {}

        # add original options
        for key in orig_config.options(list):
            options[key] = orig_config.get(list, key)

        try:
            if use_cache:
                cached_config = ConfigParser()
                cached_config.read(cache_filename)
                for option in cached_config.options(list):
                    options[option] = cached_config.get(list, option)
        except:
            pass

        cached_config = ConfigParser()
        cached_config.add_section(list)
        for key, value in options.items():
            cached_config.set(list, key, value)

        # read list
        curdir = getattr(os.path, 'curdir', '.')
        if sys.platform.find('win') < 0:
            base = urljoin('file:', os.path.abspath(curdir))
        else:
            path = os.path.abspath(os.path.curdir)
            base = urljoin('file:///', path.replace(':', '|').replace('\\', '/'))

        request = urllib2.Request(urljoin(base + '/', list))
        if options.has_key("etag"):
            request.add_header('If-None-Match', options['etag'])
        if options.has_key("last-modified"):
            request.add_header('If-Modified-Since', options['last-modified'])
        response = urllib2.urlopen(request)
        if response.headers.has_key('etag'):
            cached_config.set(list, 'etag', response.headers['etag'])
        if response.headers.has_key('last-modified'):
            cached_config.set(list, 'last-modified',
                response.headers['last-modified'])

        # convert to config.ini
        data = StringIO.StringIO(response.read())

        if callback: callback(data, cached_config)

        # write to cache
        if use_cache:
            cache = open(cache_filename, 'w')
            cached_config.write(cache)
            cache.close()

        # re-parse and proceed
        logger.debug("Using %s readinglist", list)
        if re_read:
            if use_cache:
                orig_config.read(cache_filename)
            else:
                cdata = StringIO.StringIO()
                cached_config.write(cdata)
                cdata.seek(0)
                orig_config.readfp(cdata)
    except:
        try:
            if re_read:
                if use_cache:
                    if not orig_config.read(cache_filename): raise Exception()
                else:
                    cdata = StringIO.StringIO()
                    cached_config.write(cdata)
                    cdata.seek(0)
                    orig_config.readfp(cdata)
                logger.info("Using cached %s readinglist", list)
        except:
            logger.exception("Unable to read %s readinglist", list)
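A hypothetical callback for downloadReadingList(), assuming the downloaded list is already in config.ini syntax; a real deployment would convert other reading-list formats into that syntax here instead. The function name and the example file name are illustrative only.

# Hypothetical callback: fold an already-INI-formatted reading list into the
# cached configuration passed in by downloadReadingList().
def ini_callback(data, cached_config):
    cached_config.readfp(data)

# e.g. downloadReadingList('subscriptions.ini', config.parser, ini_callback)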
# load config files (default: config.ini)
for arg in sys.argv[1:]:
    config.load(arg)
if len(sys.argv) == 1:
    config.load('config.ini')

from Queue import Queue
from threading import Thread

# determine which subscriptions have no icon but do have a html page
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    data = feedparser.parse(filename(sources, sub))
    if data.feed.get('icon'): continue
    if not data.feed.get('links'): continue
    for link in data.feed.links:
        if link.rel == 'alternate' and link.type in html:
            fetch_queue.put((sub, link.href))
            break

# find the favicon for a given webpage
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):