def run(self, **kwargs):
    """Create a schedule document plus its initial seed, then dispatch the route.

    Required keys in ``kwargs``: ``title``, ``route``, ``kwargs``.
    Optional: ``incremental`` (defaults to False).

    Raises:
        Exception: when a required key is missing or the title already exists.
    """
    # Generator membership test instead of all(map(lambda ...)) — same check,
    # idiomatic and short-circuiting.
    if not all(key in kwargs for key in ('title', 'route', 'kwargs')):
        raise Exception('invalid schedule %s' % kwargs)
    doc = {
        "title": kwargs.pop('title'),
        "route": kwargs.pop('route'),
        "created": time.time(),
        "incremental": kwargs.pop('incremental', False),
        "kwargs": kwargs.pop('kwargs'),
    }
    # find_one probes for an existing title directly instead of pulling every
    # distinct title over the wire just to test membership.
    if self.collection.find_one({'title': doc['title']}) is not None:
        raise Exception('Existed schedule title %s' % doc['title'])
    _id = self.collection.insert(doc)
    # Seed document mirrors the schedule's route/kwargs and links back via
    # _scheduleId; _status/_seedId/_ancestors start empty for a fresh seed.
    doc_seed = {
        "_scheduleId": _id,
        "_status": None,
        "_seedId": None,
        "_ancestors": [],
        "route": doc['route'],
        "incremental": doc['incremental'],
        "created": time.time(),
        "updated": time.time(),
        "kwargs": doc['kwargs'],
    }
    _id = env.MONGO.seeds.insert(doc_seed)
    # Kick off processing of the newly inserted seed.
    Route().run(doc['route'], _id, 'seeds')
def gen_next_job(self, next_route, _id):
    """Dispatch the document *_id* from the 'webpages' collection down *next_route*."""
    router = Route()
    router.run(next_route, _id, 'webpages')
def gen_next_job(self, route, _id):
    """Hand the document *_id* from the 'webpages' collection to the next job.

    NOTE(review): the *route* argument is accepted but ignored — the route
    actually dispatched is ``self.next_job``. Confirm this is intentional.
    """
    target = self.next_job
    Route().run(target, _id, 'webpages')
def start_ml():
    """Dispatch the weibo-statuses ML route for every stored stream document."""
    route = 'ml.weibo.statuses'
    for stream_id in env.MONGO.streams.distinct('_id'):
        # A fresh Route per id, matching the original dispatch pattern.
        Route().run(route, stream_id, 'streams')
def start_etl():
    """Dispatch the weibo-statuses extractor route for every stored webpage."""
    route = 'etl.extractors.weibo.statuses'
    for page_id in env.MONGO.webpages.distinct('_id'):
        # A fresh Route per id, matching the original dispatch pattern.
        Route().run(route, page_id, 'webpages')
def start_crawler():
    """Dispatch the weibo-statuses crawler route for the first seed on record.

    NOTE(review): unlike start_ml/start_etl this processes only the FIRST
    distinct _id, and it raises IndexError when the seeds collection is
    empty — confirm both are intentional.
    """
    seed_ids = env.MONGO.seeds.distinct('_id')
    first_seed = seed_ids[0]
    Route().run('crawler.weibo.statuses', first_seed, 'seeds')
def gen_next_job(self, _id):
    """Hand the document *_id* from the 'streams' collection to this stage's next job."""
    next_route = self.next_job
    Route().run(next_route, _id, 'streams')