Exemplo n.º 1
0
 def test_unicode(self):
     from planet.spider import filename
     index = idindex.create()
     iri = 'http://www.\xe8\xa9\xb9\xe5\xa7\x86\xe6\x96\xaf.com/'
     index[filename('', iri)] = 'data'
     index[filename('', iri.decode('utf-8'))] = 'data'
     index[filename('', u'1234')] = 'data'
     index.close()
Exemplo n.º 2
0
 def test_unicode(self):
     from planet.spider import filename
     index = idindex.create()
     iri = 'http://www.\xe8\xa9\xb9\xe5\xa7\x86\xe6\x96\xaf.com/'
     index[filename('', iri)] = 'data'
     index[filename('', iri.decode('utf-8'))] = 'data'
     index[filename('', u'1234')] = 'data'
     index.close()
Exemplo n.º 3
0
    def test_expunge(self):
        config.load(configfile)

        # create test entries in cache with correct timestamp
        for entry in glob.glob(testentries):
            e=minidom.parse(entry)
            e.normalize()
            eid = e.getElementsByTagName('id')
            efile = filename(workdir, eid[0].childNodes[0].nodeValue)
            eupdated = e.getElementsByTagName('updated')[0].childNodes[0].nodeValue
            emtime = time.mktime(feedparser._parse_date_w3dtf(eupdated))
            if not eid or not eupdated: continue
            shutil.copyfile(entry, efile)
            os.utime(efile, (emtime, emtime))
  
        # create test feeds in cache
        sources = config.cache_sources_directory()
        for feed in glob.glob(testfeeds):
                f=minidom.parse(feed)
                f.normalize()
                fid = f.getElementsByTagName('id')
                if not fid: continue
                ffile = filename(sources, fid[0].childNodes[0].nodeValue)
                shutil.copyfile(feed, ffile)

        # verify that exactly nine entries + one source dir were produced
        files = glob.glob(workdir+"/*")
        self.assertEqual(10, len(files))

        # verify that exactly four feeds were produced in source dir
        files = glob.glob(sources+"/*")
        self.assertEqual(4, len(files))

        # expunge...
        expungeCache()

        # verify that five entries and one source dir are left
        files = glob.glob(workdir+"/*")
        self.assertEqual(6, len(files))

        # verify that the right five entries are left
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)
Exemplo n.º 4
0
    def test_expunge(self):
        config.load(configfile)

        # create test entries in cache with correct timestamp
        for entry in glob.glob(testentries):
            e=minidom.parse(entry)
            e.normalize()
            eid = e.getElementsByTagName('id')
            efile = filename(workdir, eid[0].childNodes[0].nodeValue)
            eupdated = e.getElementsByTagName('updated')[0].childNodes[0].nodeValue
            emtime = time.mktime(feedparser._parse_date_w3dtf(eupdated))
            if not eid or not eupdated: continue
            shutil.copyfile(entry, efile)
            os.utime(efile, (emtime, emtime))
  
        # create test feeds in cache
        sources = config.cache_sources_directory()
        for feed in glob.glob(testfeeds):
                f=minidom.parse(feed)
                f.normalize()
                fid = f.getElementsByTagName('id')
                if not fid: continue
                ffile = filename(sources, fid[0].childNodes[0].nodeValue)
                shutil.copyfile(feed, ffile)

        # verify that exactly nine entries + one source dir were produced
        files = glob.glob(workdir+"/*")
        self.assertEqual(10, len(files))

        # verify that exactly four feeds were produced in source dir
        files = glob.glob(sources+"/*")
        self.assertEqual(4, len(files))

        # expunge...
        expungeCache()

        # verify that five entries and one source dir are left
        files = glob.glob(workdir+"/*")
        self.assertEqual(6, len(files))

        # verify that the right five entries are left
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)
Exemplo n.º 5
0
def create():
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index): os.makedirs(index)
    import anydbm
    index = anydbm.open(filename(index, 'id'), 'c')

    try:
        import libxml2
    except:
        libxml2 = False
        from xml.dom import minidom

    for file in glob(cache + "/*"):
        if os.path.isdir(file):
            continue
        elif libxml2:
            try:
                doc = libxml2.parseFile(file)
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                if entry and source:
                    index[filename('', entry[0].content)] = source[0].content
                doc.freeDoc()
            except:
                log.error(file)
        else:
            try:
                doc = minidom.parse(file)
                doc.normalize()
                ids = doc.getElementsByTagName('id')
                entry = [e for e in ids if e.parentNode.nodeName == 'entry']
                source = [e for e in ids if e.parentNode.nodeName == 'source']
                if entry and source:
                    index[filename('',entry[0].childNodes[0].nodeValue)] = \
                        source[0].childNodes[0].nodeValue
                doc.freeDoc()
            except:
                log.error(file)

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()

    return open()
Exemplo n.º 6
0
def create():
    from planet import logger as log
    cache = config.cache_directory()
    index=os.path.join(cache,'index')
    if not os.path.exists(index): os.makedirs(index)
    import dbhash
    index = dbhash.open(filename(index, 'id'),'c')

    try:
        import libxml2
    except:
        libxml2 = False
        from xml.dom import minidom

    for file in glob(cache+"/*"):
        if os.path.isdir(file):
            continue
        elif libxml2:
            try:
                doc = libxml2.parseFile(file)
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom','http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                if entry and source:
                    index[filename('',entry[0].content)] = source[0].content
                doc.freeDoc()
            except:
                log.error(file)
        else:
            try:
                doc = minidom.parse(file)
                doc.normalize()
                ids = doc.getElementsByTagName('id')
                entry = [e for e in ids if e.parentNode.nodeName == 'entry']
                source = [e for e in ids if e.parentNode.nodeName == 'source']
                if entry and source:
                    index[filename('',entry[0].childNodes[0].nodeValue)] = \
                        source[0].childNodes[0].nodeValue
                doc.freeDoc()
            except:
                log.error(file)

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()

    return open()
Exemplo n.º 7
0
    def test_filename(self):
        self.assertEqual(os.path.join('.', 'example.com,index.html'),
            filename('.', 'http://example.com/index.html'))
        self.assertEqual(os.path.join('.',
            'planet.intertwingly.net,2006,testfeed1,1'),
            filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1'))
        self.assertEqual(os.path.join('.',
            '00000000-0000-0000-0000-000000000000'),
            filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000'))

        # Requires Python 2.3
        try:
            import encodings.idna
        except:
            return
        self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
            filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
Exemplo n.º 8
0
def destroy():
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index): return None
    idindex = filename(index, 'id')
    if os.path.exists(idindex): os.unlink(idindex)
    os.removedirs(index)
    log.info(idindex + " deleted")
Exemplo n.º 9
0
def destroy():
    from planet import logger as log
    cache = config.cache_directory()
    index=os.path.join(cache,'index')
    if not os.path.exists(index): return None
    idindex = filename(index, 'id')
    if os.path.exists(idindex): os.unlink(idindex)
    os.removedirs(index)
    log.info(idindex + " deleted")
Exemplo n.º 10
0
def open():
    try:
        cache = config.cache_directory()
        index = os.path.join(cache, 'index')
        if not os.path.exists(index): return None
        import anydbm
        return anydbm.open(filename(index, 'id'), 'w')
    except Exception, e:
        if e.__class__.__name__ == 'DBError': e = e.args[-1]
        from planet import logger as log
        log.error(str(e))
Exemplo n.º 11
0
def open():
    try:
        cache = config.cache_directory()
        index=os.path.join(cache,'index')
        if not os.path.exists(index): return None
        import dbhash
        return dbhash.open(filename(index, 'id'),'w')
    except Exception, e:
        if e.__class__.__name__ == 'DBError': e = e.args[-1]
        from planet import logger as log
        log.error(str(e))
Exemplo n.º 12
0
def open():
    try:
        cache = config.cache_directory()
        index = os.path.join(cache, "index")
        if not os.path.exists(index):
            return None
        return dbhash.open(filename(index, "id"), "w")
    except Exception, e:
        if e.__class__.__name__ == "DBError":
            e = e.args[-1]
        from planet import logger as log

        log.error(str(e))
Exemplo n.º 13
0
def run(script, doc, output_file=None, options={}):
    """ process an Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file: 
        # filter
        context.push({'input':XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config,feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data=feedparser.parse(filename(sources,sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name',''),data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name,value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
             entry.source.config = find_config(config, entry.source)

             # add new_feed and new_date fields
             if 'id' in entry.source:
                 entry.new_feed = entry.source.id
             else:
                 entry.new_feed = None
             entry.new_date = date = None
             if entry.has_key('published_parsed'): date=entry.published_parsed
             if entry.has_key('updated_parsed'): date=entry.updated_parsed
             if date: entry.new_date = time.strftime(new_date_format, date)

             # remove new_feed and new_date fields if not "new"
             if entry.new_date == last_date:
                 entry.new_date = None
                 if entry.new_feed == last_feed:
                     entry.new_feed = None
                 else:
                     last_feed = entry.new_feed
             elif entry.new_date:
                 last_date = entry.new_date
                 last_feed = None

             # add streams for all text constructs
             for key in entry.keys():
                 if key.endswith("_detail") and entry[key].has_key('type') and \
                     entry[key].has_key('value'):
                     streamify(entry[key],entry.source.planet_bozo)
             if entry.has_key('content'):
                 for content in entry.content:
                     streamify(content,entry.source.planet_bozo)
     
        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet',True))
        context.push(vars)

    # apply template
    output=tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file,'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
Exemplo n.º 14
0
def run(script, doc, output_file=None, options={}):
    """ process an Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file: 
        # filter
        context.push({'input':XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config,feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data=feedparser.parse(filename(sources,sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name',''),data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name,value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
             entry.source.config = find_config(config, entry.source)

             # add new_feed and new_date fields
             entry.new_feed = entry.source.id
             entry.new_date = date = None
             if entry.has_key('published_parsed'): date=entry.published_parsed
             if entry.has_key('updated_parsed'): date=entry.updated_parsed
             if date: entry.new_date = time.strftime(new_date_format, date)

             # remove new_feed and new_date fields if not "new"
             if entry.new_date == last_date:
                 entry.new_date = None
                 if entry.new_feed == last_feed:
                     entry.new_feed = None
                 else:
                     last_feed = entry.new_feed
             elif entry.new_date:
                 last_date = entry.new_date
                 last_feed = None

             # add streams for all text constructs
             for key in entry.keys():
                 if key.endswith("_detail") and entry[key].has_key('type') and \
                     entry[key].has_key('value'):
                     streamify(entry[key],entry.source.planet_bozo)
             if entry.has_key('content'):
                 for content in entry.content:
                     streamify(content,entry.source.planet_bozo)
     
        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet',True))
        context.push(vars)

    # apply template
    output=tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file,'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
Exemplo n.º 15
0
    # Create the blacklist dir if it does not exist
    if not os.path.exists(blacklist):
        os.mkdir(blacklist)
        print "<p>Created directory %s</p>" % blacklist

    # find list of urls, in the form bl[n]=url

    for key in form.keys():

        if not key.startswith("bl"): continue

        url = unquote(form[key].value)

        # find corresponding files
        cache_file = filename(cache, url)
        blacklist_file = filename(blacklist, url)

        # move to blacklist if found
        if os.path.exists(cache_file):

            os.rename(cache_file, blacklist_file)

            print "<p>Blacklisted <a href='%s'>%s</a></p>" % (url, url)

        else:

            print "<p>Unknown file: %s</p>" % cache_file

        print """
<p>Note that blacklisting does not automatically 
Exemplo n.º 16
0
def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    from planet import logger
    import config
    try:

        import urllib2, StringIO
        from planet.spider import filename

        # list cache file name
        cache_filename = filename(config.cache_lists_directory(), list)

        # retrieve list options (e.g., etag, last-modified) from cache
        options = {}

        # add original options
        for key in orig_config.options(list):
            options[key] = orig_config.get(list, key)
            
        try:
            if use_cache:
                cached_config = ConfigParser()
                cached_config.read(cache_filename)
                for option in cached_config.options(list):
                     options[option] = cached_config.get(list,option)
        except:
            pass

        cached_config = ConfigParser()
        cached_config.add_section(list)
        for key, value in options.items():
            cached_config.set(list, key, value)

        # read list
        curdir=getattr(os.path, 'curdir', '.')
        if sys.platform.find('win') < 0:
            base = urljoin('file:', os.path.abspath(curdir))
        else:
            path = os.path.abspath(os.path.curdir)
            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))

        request = urllib2.Request(urljoin(base + '/', list))
        if options.has_key("etag"):
            request.add_header('If-None-Match', options['etag'])
        if options.has_key("last-modified"):
            request.add_header('If-Modified-Since',
                options['last-modified'])
        response = urllib2.urlopen(request)
        if response.headers.has_key('etag'):
            cached_config.set(list, 'etag', response.headers['etag'])
        if response.headers.has_key('last-modified'):
            cached_config.set(list, 'last-modified',
                response.headers['last-modified'])

        # convert to config.ini
        data = StringIO.StringIO(response.read())

        if callback: callback(data, cached_config)

        # write to cache
        if use_cache:
            cache = open(cache_filename, 'w')
            cached_config.write(cache)
            cache.close()

        # re-parse and proceed
        logger.debug("Using %s readinglist", list) 
        if re_read:
            if use_cache:  
                orig_config.read(cache_filename)
            else:
                cdata = StringIO.StringIO()
                cached_config.write(cdata)
                cdata.seek(0)
                orig_config.readfp(cdata)
    except:
        try:
            if re_read:
                if use_cache:  
                    if not orig_config.read(cache_filename): raise Exception()
                else:
                    cdata = StringIO.StringIO()
                    cached_config.write(cdata)
                    cdata.seek(0)
                    orig_config.readfp(cdata)
                logger.info("Using cached %s readinglist", list)
        except:
            logger.exception("Unable to read %s readinglist", list)
Exemplo n.º 17
0
# load config files (default: config.ini)
for arg in sys.argv[1:]:
    config.load(arg)
if len(sys.argv) == 1:
    config.load('config.ini')

from Queue import Queue
from threading import Thread

# determine which subscriptions have no icon but do have a html page
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    data = feedparser.parse(filename(sources, sub))
    if data.feed.get('icon'): continue
    if not data.feed.get('links'): continue
    for link in data.feed.links:
        if link.rel == 'alternate' and link.type in html:
            fetch_queue.put((sub, link.href))
            break


# find the favicon for a given webpage
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
Exemplo n.º 18
0
def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    from planet import logger
    import config
    try:

        import urllib2, StringIO
        from planet.spider import filename

        # list cache file name
        cache_filename = filename(config.cache_lists_directory(), list)

        # retrieve list options (e.g., etag, last-modified) from cache
        options = {}

        # add original options
        for key in orig_config.options(list):
            options[key] = orig_config.get(list, key)
            
        try:
            if use_cache:
                cached_config = ConfigParser()
                cached_config.read(cache_filename)
                for option in cached_config.options(list):
                     options[option] = cached_config.get(list,option)
        except:
            pass

        cached_config = ConfigParser()
        cached_config.add_section(list)
        for key, value in options.items():
            cached_config.set(list, key, value)

        # read list
        curdir=getattr(os.path, 'curdir', '.')
        if sys.platform.find('win') < 0:
            base = urljoin('file:', os.path.abspath(curdir))
        else:
            path = os.path.abspath(os.path.curdir)
            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))

        request = urllib2.Request(urljoin(base + '/', list))
        if options.has_key("etag"):
            request.add_header('If-None-Match', options['etag'])
        if options.has_key("last-modified"):
            request.add_header('If-Modified-Since',
                options['last-modified'])
        response = urllib2.urlopen(request)
        if response.headers.has_key('etag'):
            cached_config.set(list, 'etag', response.headers['etag'])
        if response.headers.has_key('last-modified'):
            cached_config.set(list, 'last-modified',
                response.headers['last-modified'])

        # convert to config.ini
        data = StringIO.StringIO(response.read())

        if callback: callback(data, cached_config)

        # write to cache
        if use_cache:
            cache = open(cache_filename, 'w')
            cached_config.write(cache)
            cache.close()

        # re-parse and proceed
        logger.debug("Using %s readinglist", list) 
        if re_read:
            if use_cache:  
                orig_config.read(cache_filename)
            else:
                cdata = StringIO.StringIO()
                cached_config.write(cdata)
                cdata.seek(0)
                orig_config.readfp(cdata)
    except:
        try:
            if re_read:
                if use_cache:  
                    if not orig_config.read(cache_filename): raise Exception()
                else:
                    cdata = StringIO.StringIO()
                    cached_config.write(cdata)
                    cdata.seek(0)
                    orig_config.readfp(cdata)
                logger.info("Using cached %s readinglist", list)
        except:
            logger.exception("Unable to read %s readinglist", list)
Exemplo n.º 19
0
  # Create the blacklist dir if it does not exist
  if not os.path.exists(blacklist):
    os.mkdir(blacklist)
    print "<p>Created directory %s</p>" % blacklist
  
  # find list of urls, in the form bl[n]=url

  for key in form.keys():

    if not key.startswith("bl"): continue

    url = unquote(form[key].value)

    # find corresponding files
    cache_file = filename(cache, url)
    blacklist_file = filename(blacklist, url)

    # move to blacklist if found
    if os.path.exists(cache_file):

      os.rename(cache_file, blacklist_file)

      print "<p>Blacklisted <a href='%s'>%s</a></p>" % (url, url)

    else:

      print "<p>Unknown file: %s</p>" % cache_file

    print """
<p>Note that blacklisting does not automatically