def main(source_, citylist_, period_):
    """Crawl the requested sources, retry failed cities, archive, and repeat.

    Args:
        source_: Comma-separated source names. A falsy value means there is
            nothing to crawl and the function returns immediately.
        citylist_: Comma-separated city names. The split list (or the
            original falsy value) is passed to ``AsyncPipeline.start``.
        period_: Number of hours to wait between crawl rounds; converted
            with ``int()`` and multiplied by 3600 for ``time.sleep``.
            A falsy value makes the function run a single round.

    Side effects:
        Reads the module-level ``error_cities`` / ``error_messages`` dicts
        and rebinds both to empty dicts after each sleep.
    """
    global error_cities
    global error_messages

    source = source_.split(",") if source_ else source_
    citylist = citylist_.split(",") if citylist_ else citylist_
    period = period_

    # Single pre-formatted argument: prints the same bytes under the
    # Python 2 print statement and the Python 3 print function.
    if source:
        print("source:  %s" % (source,))
    if citylist:
        print("city list:  %s" % (citylist,))

    if not source:
        return

    while True:
        AsyncPipeline().start(source, citylist)

        # rescue
        # Iterate over a snapshot of the keys: the rescue run below may add
        # entries to error_cities, which would otherwise abort the loop with
        # "dictionary changed size during iteration".
        for src in list(error_cities):
            remain_cities = error_cities[src]
            if not remain_cities:
                # Nothing failed for this source, so there is nothing to retry.
                # NOTE(review): an empty city list is presumably treated by
                # AsyncPipeline.start as "all cities" -- confirm.
                continue
            print("Try to rescue %s" % (src,))
            # Reset the per-source records before the retry run.
            error_cities[src] = []
            error_messages[src] = []
            AsyncPipeline().start([src], remain_cities)

        # archive first
        archiver = Archiver()
        for src in source:
            # Third argument: False archives locally, True archives to S3.
            archiver.archive(src, src, True)

        # repeat
        if not period:
            break
        time.sleep(int(period) * 3600)
        error_cities = {}
        error_messages = {}

        # check config
        stop_crawl = 0
        config = CheckConfig().check('crawl_config')
        for src in source:
            if src in config:
                entry = config[src]
                if "period" in entry:
                    period = entry["period"]
                if "stop" in entry:
                    stop_crawl = entry["stop"]
                # NOTE(review): the original indentation was lost; this break
                # is taken to belong to the `src in config` branch, i.e. only
                # the first configured source is consulted -- confirm.
                break
        if stop_crawl == 1:
            break
def main(source_, citylist_, period_):
    """Run the per-source crawl pipelines, archive the results, and repeat.

    Args:
        source_: Comma-separated source names. A falsy value means there is
            nothing to crawl and the function returns immediately.
        citylist_: Comma-separated city names. When given, each selected
            pipeline only calls ``rescue(citylist, ...)``; otherwise it calls
            ``start(...)`` and then ``rescue`` on whatever ``start`` returned,
            if that is non-empty.
        period_: Number of hours to wait between crawl rounds; converted
            with ``int()`` and multiplied by 3600 for ``time.sleep``.
            A falsy value makes the function run a single round.
    """
    source = source_.split(",") if source_ else source_
    citylist = citylist_.split(",") if citylist_ else citylist_
    period = period_

    # Single pre-formatted argument: prints the same bytes under the
    # Python 2 print statement and the Python 3 print function.
    if source:
        print("source:  %s" % (source,))
    if citylist:
        print("city list:  %s" % (citylist,))

    if not source:
        return

    # Passed unchanged to every Pipeline.start / Pipeline.rescue call.
    sleep_interval = (0, 30)

    # Supported sources, in the fixed order in which they are crawled.
    # Built inside main so the class names are resolved at call time.
    pipelines = (
        ("meituan", MeituanCrawler, MeituanParser),
        ("nuomi", NuomiCrawler, NuomiParser),
        ("lashou", LashouCrawler, LashouParser),
        ("wowo", WowoCrawler, WowoParser),
        ("dida", DidaCrawler, DidaParser),
        ("dianping", DianpingCrawler, DianpingParser),
        ("manzuo", ManzuoCrawler, ManzuoParser),
        ("ftuan", FtuanCrawler, FtuanParser),
        ("wuba", WubaCrawler, WubaParser),
    )

    while True:
        for name, crawler_cls, parser_cls in pipelines:
            if name not in source:
                continue
            app = Pipeline(crawler_cls(), parser_cls(), None, name)
            if citylist:
                # Explicit city list: go straight to rescue for those cities.
                app.rescue(citylist, sleep_interval)
                continue
            error = app.start(sleep_interval)
            if error:
                app.rescue(error, sleep_interval)

        # archive first
        archiver = Archiver()
        for src in source:
            # Third argument: False archives locally, True archives to S3.
            archiver.archive(src, src, True)

        # repeat
        if not period:
            break
        time.sleep(int(period) * 3600)

        # check config file
        stop_crawl = 0
        config = CheckConfig().check('crawl_config')
        for src in source:
            if src in config:
                entry = config[src]
                if "period" in entry:
                    period = entry["period"]
                if "stop" in entry:
                    stop_crawl = entry["stop"]
                # NOTE(review): the original indentation was lost; this break
                # is taken to belong to the `src in config` branch, i.e. only
                # the first configured source is consulted -- confirm.
                break
        if stop_crawl == 1:
            break