def local(db='file', folder=None, uids=None):
    """Crawl the given user ids on this machine, one token at a time.

    Parameters:
        db     -- storage backend: 'file' (requires `folder`) or 'mongo'.
        folder -- destination directory when db == 'file'.
        uids   -- list of uids to crawl; consumed (popped) as work proceeds.

    Uses the module globals `give_ups`, `tokens`, `callback`, `account`,
    `pwd` and `cookie_file`.  Returns when `give_ups` drops to zero or the
    uid list is exhausted.
    """
    global give_ups
    # Avoid the mutable-default-argument trap: build a fresh list per call.
    uids = [] if uids is None else uids
    create = create_cookie_file()
    fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
    if create:
        fetcher.login(cookie_filename=cookie_file)
    while give_ups > 0:
        # Wait until a crawl token becomes available (or we are told to stop).
        while not tokens:
            if give_ups > 0:
                time.sleep(1)  # yield instead of busy-spinning at 100% CPU
            else:
                return
        token = tokens.pop()
        cb = callback(token)
        if not uids:
            # No work left: signal the outer loop to terminate.
            give_ups = 0
        else:
            uid = uids.pop()
            try:
                crawler = UserCrawler(uid, is_uid=True, fetcher=fetcher,
                                      fetch_fans=False, callbacks=cb,
                                      span=False)
                uid = crawler.uid
                if db == 'file' and folder is not None:
                    storage = FileStorage(uid, folder)
                elif db == 'mongo':
                    storage = MongoStorage(uid)
                else:
                    raise ValueError('db must be "file" or "mongo", ' +
                                     'when is "file", you must define folder parameter.')
                if storage.crawled:
                    # Already done previously: mark complete and release token.
                    storage.complete()
                    cb()
                    continue
                else:
                    crawler.set_storage(storage)
                    crawler.start()
            except Exception as e:
                cb()  # always release the token on failure
                logger.exception(e)
def local(uids=None):
    """Crawl the given uids sequentially, retrying the current uid on
    network errors.

    Parameters:
        uids -- list of uids to crawl; consumed (popped) as work proceeds.

    On URLError the same uid is retried after a 10s pause instead of
    moving on to the next one.
    """
    # Avoid the mutable-default-argument trap: build a fresh list per call.
    uids = [] if uids is None else uids
    fetcher = CnFetcher()
    fetcher.login()
    connection_error = False
    while uids or connection_error:
        if not connection_error:
            uid = uids.pop()
        # else: retry the previous uid that failed with a URLError.
        try:
            crawler = UserCrawler(uid, fetcher)
            crawler.run()
            connection_error = False
        except URLError as e:
            logger.exception(e)
            connection_error = True
            time.sleep(10)  # back off before retrying the same uid
def dc():
    """Distributed-client crawl loop.

    Pulls work units (JSON) from a dispatcher socket, crawls each user
    with `UserCrawler`, and reports progress via heartbeat callbacks.
    Runs until `give_ups` reaches zero; always stops the heartbeat on
    exit.  Uses the module globals `give_ups`, `tokens`, `callback`,
    `account`, `pwd`, `cookie_file` and `buf_size`.
    """
    def run_callbacks(callbacks):
        # Fire each callback in order.
        for callback in callbacks:
            callback()

    global give_ups
    try:
        create = create_cookie_file()
        fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
        if create:
            fetcher.login(cookie_filename=cookie_file)
        while give_ups > 0:
            n = 0
            # Wait for a crawl token with a linearly growing backoff.
            while not tokens:
                if give_ups > 0:
                    n += 1
                    time.sleep(n)
                else:
                    return
            token = tokens.pop()
            cb = callback(token)
            soc = create_socket()
            try:
                data = json.loads(soc.recv(buf_size))
                if data is None:
                    # Dispatcher has nothing right now: pause and retry.
                    time.sleep(15)
                    cb()
                    continue
                elif len(data) == 0:
                    # Empty work unit counts against the give-up budget.
                    give_ups -= 1
                    continue
                user = data['user']
                is_uid = data['is_uid']
                crawled = data.get('crawled', False)
                follow = data.get('follow', None)
                # monitor callback: register the heartbeat for this user now,
                # and keep a removal callback for later.
                register_heartbeat(user)()
                register_rm_cb = register_heartbeat(user, True)
                # success / error callback bundles
                success_callbacks = (register_rm_cb, reset_error_callback)
                error_callbacks = (error_callback, register_rm_cb)
                try:
                    crawler = UserCrawler(user, is_uid=is_uid, fetcher=fetcher,
                                          fetch_fans=follow is None,
                                          callbacks=cb,
                                          success_callbacks=success_callbacks,
                                          error_callbacks=error_callbacks)
                    # The user does not exist (or resolved to a bogus uid).
                    if crawler.user_not_exist or crawler.uid == 'attention':
                        cb()
                        run_callbacks(success_callbacks)
                        continue
                    uid = crawler.uid
                    storage = MongoStorage(uid, follow, user=user)
                    if crawled or storage.crawled:
                        # Already crawled: release token and clean up.
                        cb()
                        run_callbacks(success_callbacks)
                        storage.close()
                        continue
                    else:
                        crawler.set_storage(storage)
                        crawler.start()
                except Exception as e:
                    cb()
                    run_callbacks(error_callbacks)
                    logger.exception(e)
            finally:
                soc.close()
    finally:
        # When run over, call stop heartbeat
        stop_heartbeat()
class UserCrawler(threading.Thread):
    """Thread that crawls a single weibo.cn user.

    Resolves the user's uid, fetches pages with login/retry handling,
    and hands parsed results to a storage backend.  May continue past
    the end of this view — methods below are documented as seen.
    """

    def __init__(self, user, is_uid=None, storage=None,
                 fetcher=None, fetch_fans=True, span=True,
                 # dozens of callbacks
                 callbacks=None, success_callbacks=None, error_callbacks=None, ):
        # user:   uid or screen name, depending on is_uid.
        # is_uid: True -> user is a uid; False -> it is not; None -> guess
        #         by whether `user` parses as an integer.
        super(UserCrawler, self).__init__()
        logger.info('fetch user: %s' % user)
        if is_uid is True:
            self.uid = user
        elif is_uid is False:
            self.uid = None
        else:
            try:
                int(user)
                self.uid = user
            except ValueError:
                self.uid = None
        # Numeric uids use the /u/<uid> URL form; names are used directly.
        if self.uid is not None:
            self.url = 'http://weibo.cn/u/%s' % self.uid
        else:
            self.url = 'http://weibo.cn/%s' % user
        # Build and log in a fetcher only when one is not supplied.
        if fetcher is None:
            self.fetcher = CnFetcher(account, pwd)
            self.fetcher.login()
        else:
            self.fetcher = fetcher
        self.storage = storage
        self.user_not_exist = False
        # Probe the profile page immediately: a None result means the
        # user is missing/forbidden; otherwise resolve the uid from HTML.
        html = self._fetch(self.url)
        if html is None:
            self.user_not_exist = True
        elif self.uid is None:
            parser = CnWeiboParser(html, user, self.storage)
            self.uid = parser.get_uid()
        self.fetch_fans = fetch_fans
        self.span = span
        self.error = False
        self.callbacks = callbacks
        self.success_callbacks = success_callbacks
        self.error_callbacks = error_callbacks

    def _check_user_exist(self, html):
        # If user not exist or forbiddened by weibo, directly return False.
        # (The literal is weibo.cn's "user state abnormal" notice page.)
        if u'抱歉,您当前访问的用户状态异常,暂时无法访问。' in html:
            self.error = True
            self.user_not_exist = True
            return False
        return True

    def _fetch(self, url):
        """Fetch `url`, re-logging-in with escalating backoff when the
        page is not "right"; returns the HTML, or None for a missing user."""
        html = self.fetcher.fetch(url)
        if not self._check_user_exist(html):
            return
        right = check_page_right(html)
        tries = 0
        while not right and tries <= 6:
            time.sleep(10)
            self.fetcher.login()
            # Backoff schedule: 10-30s for the first 3 tries, then
            # 10-30 minutes, then a full hour on the last try.
            sec = 10 * (tries + 1) if tries <= 2 else (
                600 * (tries - 2) if tries < 6 else 3600)
            time.sleep(sec)
            html = self.fetcher.fetch(url)
            if not self._check_user_exist(html):
                return
            right = check_page_right(html)
            if right:
                return html
            tries += 1
        else:
            # while/else: runs when the loop ends without break — since
            # there is no break, this always returns (possibly bad) html.
            return html
        # NOTE(review): unreachable — the while/else above always returns.
        self.error = True

    @property
    def info_link(self):
        # Profile info page for the resolved uid.
        return 'http://weibo.cn/%s/info' % self.uid

    @property
    def follow_link(self):
        # Followees list page.
        return 'http://weibo.cn/%s/follow' % self.uid

    @property
    def fav_link(self):
        # NOTE(review): named "fav" but points at the /fans page.
        return 'http://weibo.cn/%s/fans' % self.uid

    def set_storage(self, storage):
        # Late-bind the storage backend (chosen after uid resolution).
        self.storage = storage

    def _crawl(self, url, parser_cls):
        """Fetch `url` and parse it with `parser_cls`, retrying up to 3
        times on HTTP/URL errors; a 404 marks the crawler as errored."""
        def start(url):
            html = self._fetch(url)
            parser = parser_cls(html, self.uid, self.storage)
            return parser.parse()
        error = None
        for i in range(3):
            try:
                return start(url)
            except urllib2.HTTPError, e:
                if e.code == 404:
                    self.error = True
                    continue
                else:
                    error = e
                    continue
            except urllib2.URLError, e:
                error = e
                continue
            # NOTE(review): unreachable — every path above returns or
            # continues, so this backoff sleep never executes.
            time.sleep(i * 5)