def test_add(self):
    """Both single-URL add and list add must land every URL in the frontier."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    urls = ['http://www.sample{0}.com'.format(i) for i in xrange(5)]
    frontier.add(urls[0])    # single-item form
    frontier.add(urls[1:])   # bulk (list) form
    for url in urls:
        self.assertIn(url, frontier)
def test_setargs(self):
    """setargs stores valid rule regexes; an uncompilable one raises FrontierException."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    frontier.setargs({'rules': ['^http://', '[0-9]*']})
    self.assertEqual(2, len(frontier.args['rules']))
    # '[--' is a malformed character class and must be rejected
    bad_args = {'rules': ['[0-9]', '[--']}
    self.assertRaises(FrontierException, frontier.setargs, bad_args)
def test__feedfilter(self):
    """_feedfilter seeds the duplicate filter from the redis 'visited' list."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    frontier.clean('todo', 'visited')
    for sample in ('sample1', 'sample2', 'sample3'):
        frontier.redis.rpush(frontier.visited, sample)
    frontier._feedfilter()
    self.assertEqual(3, frontier.filter.count)
    frontier.clean('todo', 'visited')
    del frontier
def test__nextall(self):
    """_nextall drains the frontier, counting and marking every popped URL as visited."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    count_before = frontier.filter.count
    items = frontier._nextall()
    count_after = frontier.filter.count
    self.assertEqual(0, len(frontier))
    # the filter must have grown by exactly the number of popped items
    self.assertEqual(len(items), count_after - count_before)
    for item in items or ():
        self.assertTrue(frontier.isVisited(item))
def test__nextone(self):
    """_nextone pops exactly one URL, returns it as str, and marks it visited."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    size_before = len(frontier)
    if not size_before:
        # guarantee there is something to pop
        frontier.add('url-for-test-nextone')
        size_before = 1
    item = frontier._nextone()
    self.assertIsInstance(item, str)
    self.assertEqual(1, frontier.filter.count)
    self.assertEqual(1, size_before - len(frontier))
    self.assertTrue(frontier.isVisited(item))
def test_clean(self):
    """clean() wipes only the redis lists it is given ('todo' and/or 'visited')."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    frontier.clean('todo', 'visited')
    self.assertEqual(0, len(frontier))
    self.assertEqual(0, frontier.redis.llen(frontier.visited))
    frontier.add(['url1', 'url2'])
    self.assertEqual(2, len(frontier))
    frontier.next(0)  # drain todo; both urls move to visited
    self.assertEqual(2, frontier.redis.llen(frontier.visited))
    frontier.add('should left')
    frontier.clean('visited')  # only visited is cleared; todo keeps its one entry
    self.assertEqual(0, frontier.redis.llen(frontier.visited))
    self.assertEqual(1, len(frontier))
def _build(self, config):
    """Assemble this crawler from a config dict.

    Wires up the logger, scraper, frontier and handler chain in that order,
    passing optional 'args' sub-dicts to each component's setargs().
    Raises PyCrawlerException when a required config key is missing.
    """
    try:
        self.name = config['name']
        Logger.register(self.name)
        Logger.load()
        self.logger = Logger(self.name)
        self.debug = config.get('debug', True)
        self.logger.info(self.name, 'Start building...')

        scraper_cfg = config['scraper']
        self.scraper = Scraper.get(scraper_cfg['name'])(self)
        if 'args' in scraper_cfg:
            self.scraper.setargs(scraper_cfg['args'])

        frontier_cfg = config['frontier']
        self.frontier = Frontier.get(frontier_cfg['name'])(self)
        if 'args' in frontier_cfg:
            self.frontier.setargs(frontier_cfg['args'])

        for handler_cfg in config['handlers']:
            handler = Handler.get(handler_cfg['name'])(self)
            if 'args' in handler_cfg:
                handler.setargs(handler_cfg['args'])
            self.handlers.append(handler)

        self.logger.info(self.name, 'Build successful!')
    except KeyError as e:
        # surface the missing key as a crawler-level configuration error
        raise PyCrawlerException('Key \''+e.args[0]+'\' missing in config dict')
def test_validate(self):
    """A URL matching the configured rule regex passes validate()."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    # broad URL-matching regex: scheme, hostname-or-IPv4, optional port and path
    frontier.setargs({'rules': [
        '((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?']})
    self.assertTrue(frontier.validate('http://www.baidu.com'))
def test__addone(self):
    """_addone pushes a single URL into the frontier."""
    frontier = Frontier.get('BFSFrontier')(SpiderTest('testspider'))
    target = 'http://www.google.com'
    frontier._addone(target)
    self.assertIn(target, frontier)