Example #1
    def init_url_queue(self):
        # Enqueue only identifiers that are not already stored,
        # neither as a user record nor as an alias (Bie_Ming) record.
        global UserInfo_store
        for uid_or_nickname in self.uid_or_uname_list:
            if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or \
               len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
                continue

            self.url_queue.put(uid_or_nickname)
        print "crawl size ::::::::: ", self.url_queue.qsize()
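The duplicate check above ORs two fields with `Q` objects and treats a non-zero result count as "already stored". Below is a minimal sketch of that pattern, assuming the models are MongoEngine documents; the class, field, and database names are illustrative stand-ins, not the project's actual store_model.

# Minimal sketch of the OR-query dedup check, assuming MongoEngine.
# UserInfoDemo and 'weibo_demo' are hypothetical stand-ins.
from mongoengine import Document, StringField, connect
from mongoengine.queryset.visitor import Q

connect('weibo_demo')  # hypothetical database name

class UserInfoDemo(Document):
    uid_or_uname = StringField()
    nickname = StringField()

def already_stored(uid_or_nickname):
    # Match either the uid/uname field or the nickname field,
    # then ask whether any document satisfies the combined query.
    query = Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname))
    return UserInfoDemo.objects(query).count() != 0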
Example #2
    def crawl(self, uid_or_nickname, is_again=False):

        # Skip identifiers that are already stored (by uid/uname, nickname, or alias).
        url = ''
        if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or \
           len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
            WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
            return "nothing"

        # URL-encode the identifier; str()/quote_plus may raise on non-ASCII
        # unicode input, hence the guard.
        quote_uid_or_nickname = ""
        try:
            quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
        except:
            print traceback.format_exc()
            print uid_or_nickname

        # A plain numeric uid is unchanged by quoting, so it can go straight into
        # the profile URL; anything else is treated as a nickname lookup.
        if quote_uid_or_nickname == uid_or_nickname:
            url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
        else:
            url = "http://weibo.cn/n/" + quote_uid_or_nickname

        # Fetch the page with a logged-in cookie and a proxy.
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()

        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

        user_info = ""
        try:
            page = craw_object.get_page()
            user_info = page_parser_from_search_for_UserInfo(page, url)
        except:
            if is_again:
                # Retry once with the original identifier, then give up.
                return self.crawl(uid_or_nickname, is_again=False)
            else:
                return user_info

        return user_info
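The branch on `quote_uid_or_nickname == uid_or_nickname` works because `quote_plus` leaves a purely numeric uid untouched but rewrites spaces and non-ASCII bytes, so an unchanged string is taken to be a uid and anything else a nickname. A quick illustration with Python 2's `urllib`, matching the examples above:

# Why the quote_plus comparison distinguishes uids from nicknames (Python 2).
from urllib import quote_plus

print quote_plus("2080114694")                 # unchanged -> treated as a uid
print quote_plus("nlp crawler")                # 'nlp+crawler' -> treated as a nickname
print quote_plus("\xe5\xbe\xae\xe5\x8d\x9a")   # UTF-8 bytes percent-encoded -> nickname URL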
Example #3
# -*- coding: utf-8 -*-
'''
Created on 2016-05-02

@author: nlp
'''
from store_model import UserInfo_store


if __name__ == '__main__':
    # Quick sanity check: count the stored records for a single uid.
    print len(UserInfo_store.objects(uid_or_uname=str("2080114694")))
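The same lookup can also be expressed with `count()` instead of wrapping the queryset in `len()`, which states the intent directly and lets the database do the counting; this again assumes a MongoEngine-style queryset, as in the sketch after Example #1.

    # Equivalent check using count(); same uid literal as above.
    print UserInfo_store.objects(uid_or_uname="2080114694").count()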