def test_addtask(self):
    """Adding one URL to Spider1 grows its frontier by exactly one entry."""
    d = Driver(DRIVER)
    s = d.getspider('Spider1')
    # Empty the frontier first so the length delta below is attributable
    # to the single addtask call.
    s.frontier.clean('todo', 'visited')
    try:
        before = len(s.frontier)
        d.addtask('Spider1', 'http://www.nevervisited.com')
        after = len(s.frontier)
        self.assertEqual(1, after - before)
    finally:
        # Clean up even when the assertion fails; otherwise the URL added
        # above stays in the frontier.
        # NOTE(review): presumably the frontier outlives this Driver
        # instance (hence the clean on entry) -- confirm.
        s.frontier.clean('todo', 'visited')
        del d
def _buildseeds(keywordfile, years):
    """Return the seed URLs for every keyword loaded from *keywordfile*.

    For each keyword, seeds are generated once for *years* and once for
    2015 with the extra ``[1, 2, 3, 4]`` argument, in that order.
    """
    urls = []
    for keyword in loadkeywords(keywordfile):
        urls.extend(generateseeds(keyword, years))
        urls.extend(generateseeds(keyword, [2015], [1, 2, 3, 4]))
    return urls


def main(option, *args):
    """Run one crawler command.

    Args:
        option: One of 'start', 'report', 'extract' or 'recover'. Any
            other value prints 'Option not supported.' and returns.
        *args: Command arguments: ``logfile outfile`` for 'extract',
            ``spidername urlfile`` for 'recover'. A wrong argument count
            prints the command's usage line instead of running it.

    Returns:
        None in every case.
    """
    if option not in ('start', 'report', 'extract', 'recover'):
        print('Option not supported.')
        return

    if option == 'extract':
        if len(args) != 2:
            print('Usage: python WSJCrawler.py extract logfile outfile')
        else:
            extractlog(args[0], args[1])
        return

    # Check 'recover' arguments before building the driver, the same way
    # 'extract' is checked, so a usage error never constructs a Driver.
    if option == 'recover' and len(args) != 2:
        print('Usage: python WSJCrawler.py recover spidername urlfile')
        return

    driver = Driver(SETTINGS)
    if option == 'start':
        years = list(range(2005, 2015))
        # Build both seed lists before queueing anything, so a failure
        # while loading keywords leaves no tasks half-added.
        inc_urls = _buildseeds('./data/inc.txt', years)
        word_urls = _buildseeds('./data/word.txt', years)
        driver.addtask('IncSpider', inc_urls)
        driver.addtask('WordSpider', word_urls)
        driver.start()
    elif option == 'report':
        driver.report()
    else:
        # Only 'recover' with exactly two arguments reaches this branch.
        driver.recover(args[0], args[1])
def test_start(self):
    """Smoke-test the start / pause / resume / stop cycle with two spiders.

    There are no assertions; the test passes when the whole cycle runs
    without raising.
    """
    d = Driver(DRIVER)
    spiders = ('Spider1', 'Spider2')
    for name in spiders:
        d.getspider(name).frontier.clean('todo', 'visited')
    urls = ['http://www.baidu.com',
            'http://www.zhihu.com',
            'http://www.renren.com']
    try:
        # addtask is exercised with both a list of URLs and a single URL.
        d.addtask('Spider1', urls)
        d.addtask('Spider2', urls[0])
        d.start()
        try:
            d.pause()
            time.sleep(1)
            d.resume()
            time.sleep(2)
        finally:
            # Once started, always stop the driver, even if pause/resume
            # raised, so a failure does not leave it running.
            d.stop()
    finally:
        # Leave both frontiers empty whether or not the cycle succeeded.
        for name in spiders:
            d.getspider(name).frontier.clean('todo', 'visited')
def test_getspider(self):
    """getspider yields a Spider for 'Spider1' and None for 'none'."""
    driver = Driver(DRIVER)

    known = driver.getspider('Spider1')
    self.assertIsInstance(known, Spider)

    missing = driver.getspider('none')
    self.assertIsNone(missing)

    del driver
def test_addspider(self):
    """addspider raises on the string 'none' and accepts a Spider instance."""
    driver = Driver(DRIVER)

    # A plain string must be refused with PyCrawlerException.
    with self.assertRaises(PyCrawlerException):
        driver.addspider('none')

    # A Spider built from SPIDER is accepted, bringing the count to three.
    extra = Spider(SPIDER)
    driver.addspider(extra)
    self.assertEqual(3, len(driver))

    del driver
def test__build(self):
    """A Driver built from DRIVER holds two spiders, Spider1 and Spider2."""
    driver = Driver(DRIVER)
    self.assertEqual(2, len(driver))

    # Each spider must be retrievable by name and report that same name.
    for expected in ('Spider1', 'Spider2'):
        self.assertEqual(expected, driver.getspider(expected).name)

    del driver