Пример #1
0
def _collect_seeds(keyword_file, years):
    """Build the seed URL list for every keyword read from *keyword_file*.

    For each keyword returned by ``loadkeywords(keyword_file)`` two batches
    are appended, in this order: ``generateseeds(keyword, years)`` and then
    ``generateseeds(keyword, [2015], [1, 2, 3, 4])``.

    Args:
        keyword_file: Path handed unchanged to ``loadkeywords``.
        years: Year list handed unchanged to the first ``generateseeds`` call.

    Returns:
        A new list holding all generated seeds in generation order.
    """
    seeds = []
    for keyword in loadkeywords(keyword_file):
        seeds.extend(generateseeds(keyword, years))
        # NOTE(review): the third argument presumably limits 2015 to its
        # first four months -- confirm against generateseeds.
        seeds.extend(generateseeds(keyword, [2015], [1, 2, 3, 4]))
    return seeds


def main(option, *args):
    """Dispatch one crawler command.

    Supported values of *option*:

    * ``'extract'`` -- needs exactly two extra arguments (logfile, outfile)
      and forwards them to ``extractlog``; no ``Driver`` is created.
    * ``'start'`` -- collects seeds for ``./data/inc.txt`` and
      ``./data/word.txt``, registers them as the ``'IncSpider'`` and
      ``'WordSpider'`` tasks and calls ``driver.start()``.
    * ``'report'`` -- calls ``driver.report()``.
    * ``'recover'`` -- needs exactly two extra arguments (spidername,
      urlfile) and forwards them to ``driver.recover``.

    Any other option, or a wrong number of extra arguments, only prints a
    message. The function always returns ``None``.

    Args:
        option: Command name.
        *args: Extra positional arguments for ``extract`` / ``recover``.
    """
    if option not in ('start', 'report', 'extract', 'recover'):
        print('Option not supported.')
        return

    # 'extract' is handled before the Driver is built, so it works without one.
    if option == 'extract':
        if len(args) != 2:
            print('Usage: python WSJCrawler.py extract logfile outfile')
        else:
            extractlog(args[0], args[1])
        return

    driver = Driver(SETTINGS)
    if option == 'start':
        # One shared list for 2005..2014, passed to every generateseeds call.
        years = list(range(2005, 2015))
        # Both seed lists are fully built before any task is registered.
        inc_urls = _collect_seeds('./data/inc.txt', years)
        word_urls = _collect_seeds('./data/word.txt', years)
        driver.addtask('IncSpider', inc_urls)
        driver.addtask('WordSpider', word_urls)
        driver.start()
    elif option == 'report':
        driver.report()
    elif option == 'recover':
        if len(args) != 2:
            print('Usage: python WSJCrawler.py recover spidername urlfile')
        else:
            driver.recover(args[0], args[1])