Example #1
File: run.py  Project: xiaohui2856/crawl
def main():
    config_logging()

    if not os.path.exists(settings.json_restore_path):
        CrawlerUtils.make_dir(settings.json_restore_path)

    cur_date = CrawlerUtils.get_cur_y_m_d()
    set_codecracker()

    if len(sys.argv) >= 2 and sys.argv[1] == "check":
        dt = None
        if len(sys.argv) == 3:
            dt = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d")
        checker = Checker(dt)
        checker.run()
        return

    if len(sys.argv) < 3:
        print('usage: run.py [check] [max_crawl_time(minutes) province...]\n'
              '\tmax_crawl_time: maximum crawl time, in minutes\n'
              '\tprovince: space-separated list of provinces to crawl; '
              '"all" crawls every supported province')
        return

    try:
        max_crawl_time = int(sys.argv[1])
        settings.max_crawl_time = datetime.timedelta(minutes=max_crawl_time)
    except ValueError:
        settings.logger.error('invalid max_crawl_time, should be an integer')
        os._exit(1)

    # threading.Timer expects an interval in seconds, but max_crawl_time was
    # given in minutes, so convert before arming the watchdog timer.
    timer = threading.Timer(max_crawl_time * 60, force_exit)
    timer.start()

    settings.logger.info(u'about to start crawling; maximum crawl time is %s' % settings.max_crawl_time)
    settings.start_crawl_time = datetime.datetime.now()

    if sys.argv[2] == 'all':
        args = sorted(province_crawler)
        process_pool = MyPool()
        process_pool.map(crawl_province, args)
        process_pool.close()
        settings.logger.info("wait processes....")
        process_pool.join()
    else:
        provinces = sys.argv[2:]
        for p in provinces:
            if p not in province_crawler:
                settings.logger.warn('province %s is not supported currently', p)
                continue

            crawl_province(p)
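
A few hypothetical invocations of run.py (assuming the argument parsing above;
the date format is %Y-%m-%d, and province names must be keys of the
province_crawler mapping shown in Example #2):

    python run.py check 2016-01-02        # re-check records for a given date
    python run.py 30 zhejiang liaoning    # crawl two provinces, 30-minute cap
    python run.py 30 all                  # crawl every supported province
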
Example #2
File: run.py  Project: xiaohui2856/crawl
    'chongqing': ChongqingClawer,
    'zhejiang': ZhejiangCrawler,
    'liaoning': LiaoningCrawler,
    'gansu': GansuClawer,
    'guangxi': GuangxiCrawler,
    'shanxi': ShanxiCrawler,
    'qinghai': QinghaiCrawler,
    'hubei': HubeiCrawler,
    'guizhou': GuizhouCrawler,
    'jilin': JilinCrawler,
    'hainan': HainanCrawler,
    'xizang': XizangCrawler,
}
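
The mapping above is what main() in Example #1 consults when dispatching
provinces. crawl_province itself is not part of these excerpts; a minimal
hypothetical sketch of how the mapping is presumably consumed (the
constructor signature and the run() entry point are assumptions, not the
project's confirmed API):

def crawl_province(province):
    # Hypothetical sketch only; the real crawl_province in run.py is not
    # shown in these excerpts.
    crawler_cls = province_crawler[province]
    crawler = crawler_cls()  # constructor signature assumed
    crawler.run()            # entry-point method assumed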

process_pool = None
cur_date = CrawlerUtils.get_cur_y_m_d()


def set_codecracker():
    for province in sorted(province_crawler.keys()):
        try:
            # attach a captcha recognizer to each crawler class
            province_crawler[province].code_cracker = CaptchaRecognition(
                province)
        except Exception as e:
            settings.logger.warn(
                "failed to init captcha recognition of %s: %s", province, e)


def config_logging():
    settings.logger = logging.getLogger('enterprise-crawler')
    settings.logger.setLevel(settings.log_level)
    fh = logging.FileHandler(settings.log_file)
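    # The excerpt is cut off here. A typical completion (assumed, not part of
    # the original snippet) would attach the file handler to the logger:
    fh.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s %(message)s'))
    settings.logger.addHandler(fh)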