def local(db='file', folder=None, uids=None):
    """Crawl the given uids locally, storing results in files or MongoDB.

    A ``UserCrawler`` is created and started for each uid popped from
    ``uids``, one at a time, whenever a token can be taken from the
    module-level ``tokens`` list. The loop ends when ``uids`` is empty
    (the global ``give_ups`` is then set to 0) or when ``give_ups`` is no
    longer positive while waiting for a token.

    :param db: storage backend, ``'file'`` or ``'mongo'``.
    :param folder: target folder; must be given when ``db`` is ``'file'``.
    :param uids: list of user ids to crawl. It is consumed in place with
        ``pop()``, so the caller's list shrinks as work proceeds. ``None``
        (the default) is treated as an empty list.
    """
    global give_ups

    # Local import so this function does not depend on a file-level import.
    import time

    if uids is None:
        uids = []

    create = create_cookie_file()
    fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
    if create:
        fetcher.login(cookie_filename=cookie_file)

    while give_ups > 0:
        # Wait for a free token. Sleep between polls instead of spinning,
        # so the wait does not burn a CPU core.
        while len(tokens) == 0:
            if give_ups <= 0:
                return
            time.sleep(1)

        token = tokens.pop()
        # cb is handed to the crawler and also called directly on every
        # path below that does not reach crawler.start(); presumably it
        # releases the token -- confirm against callback().
        cb = callback(token)

        if len(uids) == 0:
            # Nothing left to crawl: make the outer loop condition fail.
            give_ups = 0
            continue

        uid = uids.pop()
        try:
            crawler = UserCrawler(uid, is_uid=True, fetcher=fetcher,
                                  fetch_fans=False, callbacks=cb, span=False)
            uid = crawler.uid

            if db == 'file' and folder is not None:
                storage = FileStorage(uid, folder)
            elif db == 'mongo':
                storage = MongoStorage(uid)
            else:
                # NOTE(review): this is raised inside the broad except
                # below, so a bad db/folder combination is only logged,
                # once per uid, rather than propagated to the caller.
                raise ValueError('db must be "file" or "mongo", '
                                 'when is "file", you must define folder '
                                 'parameter.')

            if storage.crawled:
                # Already crawled: finish the storage and skip this uid.
                storage.complete()
                cb()
                continue

            crawler.set_storage(storage)
            crawler.start()
        except Exception as e:
            # Best effort: log the failure and move on to the next uid.
            cb()
            logger.exception(e)
def local(uids=None):
    """Crawl every uid in ``uids``, retrying a uid after connection errors.

    A single ``CnFetcher`` is created and logged in, then each uid is
    popped from ``uids`` and crawled with ``UserCrawler(uid, fetcher).run()``.
    When ``URLError`` is raised the error is logged and the same uid is
    retried after a 10 second pause, with no retry limit. Any other
    exception propagates to the caller.

    :param uids: list of user ids to crawl. It is consumed in place with
        ``pop()``, so the caller's list shrinks as work proceeds. ``None``
        (the default) is treated as an empty list.
    """
    if uids is None:
        uids = []

    fetcher = CnFetcher()
    fetcher.login()

    while len(uids) > 0:
        uid = uids.pop()
        # Keep retrying this uid until a run completes without URLError.
        while True:
            try:
                crawler = UserCrawler(uid, fetcher)
                crawler.run()
            except URLError as e:
                logger.exception(e)
                time.sleep(10)
            else:
                break
def dc():
    """Run the distributed-crawl worker loop.

    Repeatedly takes a token from the module-level ``tokens`` list, opens
    a socket with ``create_socket()``, reads one JSON payload from it and
    starts a ``UserCrawler`` backed by ``MongoStorage`` for that payload.
    The loop stops when the global ``give_ups`` counter is no longer
    positive. ``stop_heartbeat()`` is always called on the way out.

    The payload is read as a JSON object with the keys ``user`` and
    ``is_uid`` and the optional keys ``crawled`` and ``follow``. A JSON
    ``null`` payload causes a 15 second pause; an empty payload decrements
    ``give_ups``.
    """
    global give_ups

    def run_callbacks(callbacks):
        # Invoke each zero-argument callable in order.
        for fn in callbacks:
            fn()

    try:
        create = create_cookie_file()
        fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
        if create:
            fetcher.login(cookie_filename=cookie_file)

        while give_ups > 0:
            # Wait for a free token, sleeping 1s, 2s, 3s, ... between polls.
            n = 0
            while len(tokens) == 0:
                if give_ups <= 0:
                    return
                n += 1
                time.sleep(n)

            token = tokens.pop()
            # cb is handed to the crawler and also called directly on every
            # path below that does not reach crawler.start(); presumably it
            # releases the token -- confirm against callback().
            cb = callback(token)

            soc = create_socket()
            try:
                data = json.loads(soc.recv(buf_size))
                if data is None:
                    time.sleep(15)
                    cb()
                    continue
                elif len(data) == 0:
                    give_ups -= 1
                    # Call cb here as on every other early exit, so the
                    # token taken above is not left dangling.
                    cb()
                    continue

                user = data['user']
                is_uid = data['is_uid']
                crawled = data.get('crawled', False)
                follow = data.get('follow', None)

                # Monitor callbacks: call the heartbeat callback for this
                # user now, and keep the second one for the callback
                # tuples below.
                register_heartbeat(user)()
                register_rm_cb = register_heartbeat(user, True)

                success_callbacks = (register_rm_cb, reset_error_callback)
                error_callbacks = (error_callback, register_rm_cb)

                try:
                    crawler = UserCrawler(user, is_uid=is_uid,
                                          fetcher=fetcher,
                                          fetch_fans=follow is None,
                                          callbacks=cb,
                                          success_callbacks=success_callbacks,
                                          error_callbacks=error_callbacks)

                    # The user does not exist.
                    if crawler.user_not_exist or crawler.uid == 'attention':
                        cb()
                        run_callbacks(success_callbacks)
                        continue

                    uid = crawler.uid
                    storage = MongoStorage(uid, follow, user=user)

                    if crawled or storage.crawled:
                        # Already crawled: report success without crawling.
                        cb()
                        run_callbacks(success_callbacks)
                        storage.close()
                        continue

                    crawler.set_storage(storage)
                    crawler.start()
                except Exception as e:
                    # Best effort: report the failure and keep looping.
                    cb()
                    run_callbacks(error_callbacks)
                    logger.exception(e)
            finally:
                # Runs on every exit from the block above, including the
                # `continue` statements.
                soc.close()
    finally:
        # When the loop is over, stop the heartbeat.
        stop_heartbeat()