def init_url_queue(self): global UserInfo_store for uid_or_nickname in self.uid_or_uname_list: if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or\ len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0: continue self.url_queue.put(uid_or_nickname) print "crawl size ::::::::: ", self.url_queue.qsize() pass
def crawl(self, uid_or_nickname, is_again=False):
    """Fetch and parse the weibo.cn profile page for one uid or nickname.

    Args:
        uid_or_nickname: Identifier to look up.  If URL-quoting leaves it
            unchanged it is requested as ``http://weibo.cn/<id>?f=search_0``;
            otherwise the quoted form is requested under
            ``http://weibo.cn/n/``.
        is_again: When true, a failed fetch/parse is retried exactly once.

    Returns:
        ``"nothing"`` when a matching record already exists in
        ``UserInfo_store`` or ``Bie_Ming_store``; otherwise the value
        returned by ``page_parser_from_search_for_UserInfo``, or ``""``
        when fetching/parsing failed (after the optional retry).
    """
    key = str(uid_or_nickname)
    if len(UserInfo_store.objects(Q(uid_or_uname=key) | Q(nickname=key))) != 0 or \
            len(Bie_Ming_store.objects(Q(uid_or_uname=key) | Q(bie_ming=key))) != 0:
        WeiboSearchLog().get_scheduler_logger().info(
            "already in the database : " + uid_or_nickname)
        return "nothing"

    # Best-effort quoting: on failure the quoted form stays empty and the
    # problem is reported on stdout, as before.
    quote_uid_or_nickname = ""
    try:
        quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
    except Exception:
        print(traceback.format_exc())
        print(uid_or_nickname)

    if quote_uid_or_nickname == uid_or_nickname:
        url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    else:
        url = "http://weibo.cn/n/" + quote_uid_or_nickname

    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)

    scheduler_logger = WeiboSearchLog().get_scheduler_logger()
    scheduler_logger.info(self.name + " start to crawl ! " + url)

    user_info = ""
    try:
        page = craw_object.get_page()
        user_info = page_parser_from_search_for_UserInfo(page, url)
    except Exception:
        # Record the failure instead of swallowing it silently.
        scheduler_logger.info(
            self.name + " failed to crawl " + url + "\n" + traceback.format_exc())
        if is_again:
            # Retry with the ORIGINAL identifier.  Passing the built URL
            # here would be re-quoted into a bogus
            # "http://weibo.cn/n/http%3A%2F%2F..." address.
            return self.crawl(uid_or_nickname, is_again=False)
    return user_info
# -*- coding: utf-8 -*-
'''
Created on May 2, 2016

@author: nlp
'''
from store_model import UserInfo_store

if __name__ == '__main__':
    # Ad-hoc check: print how many UserInfo_store records have this
    # uid_or_uname value.
    matches = UserInfo_store.objects(uid_or_uname="2080114694")
    print(len(matches))
# -*- coding: utf-8 -*-
'''
Created on May 2, 2016

@author: nlp
'''
from store_model import UserInfo_store

if __name__ == '__main__':
    # Query UserInfo_store for the hard-coded identifier and print the
    # number of records found.
    target_uid = "2080114694"
    record_count = len(UserInfo_store.objects(uid_or_uname=target_uid))
    print(record_count)