Пример #1
0
 def test_add(self):
     """Check that ``add`` takes a single URL as well as a list of URLs.

     The first URL is added on its own and the remaining four as one list;
     afterwards every URL must be reported as contained in the frontier.
     """
     f = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
     # ``range`` yields the same five values as ``xrange`` and, unlike it,
     # is also defined on Python 3.
     urls = ['http://www.sample{0}.com'.format(x) for x in range(5)]
     f.add(urls[0])
     f.add(urls[1:])
     for each in urls:
         # assertIn names the missing URL on failure; assertTrue would only
         # report "False is not true".
         self.assertIn(each, f)
Пример #2
0
 def test_setargs(self):
     """Exercise ``setargs`` with an accepted and a rejected rule list."""
     frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))

     # Both rules are expected to be kept under args['rules'].
     accepted = {'rules': ['^http://', '[0-9]*']}
     frontier.setargs(accepted)
     self.assertEqual(2, len(frontier.args['rules']))

     # The second rule list must make setargs raise FrontierException.
     rejected = {'rules': ['[0-9]', '[--']}
     with self.assertRaises(FrontierException):
         frontier.setargs(rejected)
Пример #3
0
 def test__feedfilter(self):
     """Check that ``_feedfilter`` accounts for every entry in the visited list.

     Three values are pushed onto the Redis list named by ``f.visited``;
     after ``_feedfilter`` the filter's ``count`` is expected to be 3.
     """
     f = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
     # Start from empty 'todo' and 'visited' stores so the count is exact.
     f.clean('todo', 'visited')
     try:
         for value in ('sample1', 'sample2', 'sample3'):
             f.redis.rpush(f.visited, value)
         f._feedfilter()
         self.assertEqual(3, f.filter.count)
     finally:
         # Run the cleanup even when the assertion fails; otherwise the
         # seeded entries would stay in Redis and leak into other tests.
         f.clean('todo', 'visited')
         del f
Пример #4
0
 def test__nextall(self):
     """Check the effects of draining the frontier with ``_nextall``.

     After the call the frontier must be empty, the filter count must have
     grown by the number of returned items, and each returned item must be
     reported as visited.
     """
     frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
     count_before = frontier.filter.count
     drained = frontier._nextall()
     count_after = frontier.filter.count

     self.assertEqual(0, len(frontier))
     self.assertEqual(len(drained), count_after - count_before)

     # Nothing was returned, so there are no items left to check.
     if not drained:
         return
     for item in drained:
         self.assertTrue(frontier.isVisited(item))
Пример #5
0
 def test__nextone(self):
     """Check that ``_nextone`` pops exactly one item and marks it visited.

     If the frontier is empty a URL is added first so there is something
     to pop. Queue length and filter count are both compared before and
     after the call, so the test does not rely on the stores being empty
     when it starts.
     """
     f = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
     if len(f) == 0:
         f.add('url-for-test-nextone')
     # Measure after seeding instead of assuming the length is now 1.
     before = len(f)
     # Baseline for the filter, mirroring test__nextall: the count may be
     # non-zero already, so only its growth is asserted.
     count_before = f.filter.count
     item = f._nextone()
     after = len(f)
     self.assertIsInstance(item, str)
     self.assertEqual(1, f.filter.count - count_before)
     self.assertEqual(1, before - after)
     self.assertTrue(f.isVisited(item))
Пример #6
0
 def test_clean(self):
     """Check ``clean`` for both stores at once and for 'visited' alone."""
     frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))

     def visited_size():
         # Length of the Redis list that holds the visited entries.
         return frontier.redis.llen(frontier.visited)

     # Cleaning both stores must leave the queue and the visited list empty.
     frontier.clean('todo', 'visited')
     self.assertEqual(0, len(frontier))
     self.assertEqual(0, visited_size())

     # Queue two URLs; after next(0) both are expected in the visited list.
     frontier.add(['url1', 'url2'])
     self.assertEqual(2, len(frontier))
     frontier.next(0)
     self.assertEqual(2, visited_size())

     # Cleaning only 'visited' must keep the newly queued entry.
     frontier.add('should left')
     frontier.clean('visited')
     self.assertEqual(0, visited_size())
     self.assertEqual(1, len(frontier))
Пример #7
0
 def _build(self, config):
     """Assemble logger, scraper, frontier and handlers from ``config``.

     Keys read from ``config``:
         'name'     -- required; stored as ``self.name`` and used for the logger.
         'debug'    -- optional; defaults to True.
         'scraper'  -- required dict with 'name' and optional 'args'.
         'frontier' -- required dict with 'name' and optional 'args'.
         'handlers' -- required iterable of dicts, each with 'name' and
                       optional 'args'; built handlers are appended to
                       ``self.handlers``.

     Wherever 'args' is present it is passed to the component's ``setargs``.

     Raises:
         PyCrawlerException: when a KeyError is raised anywhere during the
             build; it is reported as a key missing from the config dict.
     """
     try:
         self.name = config['name']
         Logger.register(self.name)
         Logger.load()
         self.logger = Logger(self.name)
         self.debug = config.get('debug', True)
         self.logger.info(self.name, 'Start building...')
         self.scraper = Scraper.get(config['scraper']['name'])(self)
         if 'args' in config['scraper']:
             self.scraper.setargs(config['scraper']['args'])
         self.frontier = Frontier.get(config['frontier']['name'])(self)
         if 'args' in config['frontier']:
             self.frontier.setargs(config['frontier']['args'])
         for each in config['handlers']:
             handler = Handler.get(each['name'])(self)
             if 'args' in each:
                 handler.setargs(each['args'])
             self.handlers.append(handler)
         self.logger.info(self.name, 'Build successful!')
     except KeyError as e:
         # Interpolate instead of concatenating: a non-string key would make
         # ``str + key`` raise TypeError, and a KeyError raised without
         # arguments would make ``e.args[0]`` raise IndexError, either of
         # which would hide the real problem.
         missing = e.args[0] if e.args else ''
         raise PyCrawlerException('Key \'%s\' missing in config dict' % (missing,))
Пример #8
0
 def test_validate(self):
     """Check that ``validate`` accepts a plain http URL under a URL rule."""
     f = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
     # Raw string: the pattern contains ``\.`` and ``\&``, which are not valid
     # string escapes and trigger warnings on newer Python versions. The raw
     # literal has exactly the same value, so the rule itself is unchanged.
     args = {'rules': [
         r'((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?']}
     f.setargs(args)
     self.assertTrue(f.validate('http://www.baidu.com'))
Пример #9
0
 def test__addone(self):
     """Check that a URL passed to ``_addone`` is then contained in the frontier."""
     f = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
     url = 'http://www.google.com'
     f._addone(url)
     # assertIn shows the URL and the container on failure, where
     # assertTrue(url in f) would only report "False is not true".
     self.assertIn(url, f)