def _collectseeds(keywordfile, years):
    """Return the seed URLs for every keyword loaded from *keywordfile*.

    For each keyword yielded by ``loadkeywords(keywordfile)`` two batches of
    seeds are requested from ``generateseeds`` and concatenated in order:
    one for *years*, and one for 2015 restricted to ``[1, 2, 3, 4]``.
    """
    seeds = []
    for keyword in loadkeywords(keywordfile):
        seeds.extend(generateseeds(keyword, years))
        # NOTE(review): the third argument presumably selects months 1-4 of
        # 2015 (a partial year) -- confirm against generateseeds.
        seeds.extend(generateseeds(keyword, [2015], [1, 2, 3, 4]))
    return seeds


def main(option, *args):
    """Dispatch a crawler command given on the command line.

    Args:
        option: One of ``'start'``, ``'report'``, ``'extract'`` or
            ``'recover'``. Anything else prints ``'Option not supported.'``.
        *args: Extra positional arguments. ``extract`` expects
            ``logfile outfile``; ``recover`` expects ``spidername urlfile``.
            ``start`` and ``report`` ignore them.

    Returns:
        None. Usage problems are reported with ``print`` rather than raised.
    """
    # Commands that take exactly two positional arguments, with the usage
    # line printed when the count is wrong.
    usage = {
        'extract': 'Usage: python WSJCrawler.py extract logfile outfile',
        'recover': 'Usage: python WSJCrawler.py recover spidername urlfile',
    }

    if option not in ('start', 'report', 'extract', 'recover'):
        print('Option not supported.')
        return

    # Validate arguments up front so a usage error never constructs a Driver.
    if option in usage and len(args) != 2:
        print(usage[option])
        return

    if option == 'extract':
        # Log extraction does not need a Driver at all.
        extractlog(args[0], args[1])
        return

    driver = Driver(SETTINGS)
    if option == 'start':
        years = list(range(2005, 2015))  # 2005 through 2014 inclusive
        incseeds = _collectseeds('./data/inc.txt', years)
        wordseeds = _collectseeds('./data/word.txt', years)
        driver.addtask('IncSpider', incseeds)
        driver.addtask('WordSpider', wordseeds)
        driver.start()
    elif option == 'report':
        driver.report()
    else:  # option == 'recover'; argument count already validated above
        driver.recover(args[0], args[1])