Exemplo n.º 1
0
    def crawl(self, url, is_again=True):
        """Fetch *url* through the shared proxy and parse a uid/uname from it.

        On any failure: rotate the shared proxy (guarded by a lock so only the
        first failing thread drops it), retry once, and finally park the url
        on self.second_url_queue.

        :param url: search-result page URL to fetch.
        :param is_again: internal retry flag; the single retry passes False.
        :return: parsed uid or uname string, or "" when the fetch failed.
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        uid_or_uname = ""
        try:
            page = craw_object.get_page()

            uid_or_uname = page_parser_from_search_for_uid(page)
        except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
            print(traceback.format_exc())
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                # Drop the proxy only if no other thread rotated it first.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Guarantee release even if del_proxy/logging raises.
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                self.second_url_queue.put(url)
                return uid_or_uname
        return uid_or_uname
Exemplo n.º 2
0
    def crawl(self):
        """POST the advanced-search form for self.keyword and parse the result.

        Retries forever — rotating the proxy and sleeping a short random
        back-off — until at least one weibo is parsed.

        :return: non-empty list of parsed weibo objects.
        """
        # Search form: real-time, original-only weibo, sorted by time.
        # (nick / starttime / endtime fields are intentionally unset.)
        self.data['advancedfilter'] = '1'
        self.data['keyword'] = self.keyword
        self.data['hasori'] = '1'
        self.data['sort'] = 'time'
        self.data['smblog'] = '搜索'
        url = 'http://weibo.cn/search/'

        # Retry loop instead of unbounded self-recursion: a long proxy outage
        # would otherwise exhaust the recursion limit.
        while True:
            loginer = Loginer()
            cookie = loginer.get_cookie()
            proxy = loginer.get_proxy()
            craw_object = Crawler_with_proxy(url, cookie, proxy)

            try:
                page = craw_object.get_page_with_form(self.data)
                weibo_list = page_parser_from_search(page)
            except Exception:  # was a bare except
                print(traceback.format_exc())
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
                time.sleep(int(random.random() * 10))
                continue

            if weibo_list:
                return weibo_list

            # Empty parse: treat as a dead proxy and try again.
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(
                self.name + " get nothing, change proxy !")
            time.sleep(int(random.random() * 10))
Exemplo n.º 3
0
 def crawl(self, url, is_again=True):
     """Fetch *url* through the shared proxy and parse a uid/uname from it.

     On any failure: rotate the shared proxy (guarded by a lock so only the
     first failing thread drops it), retry once, and finally park the url on
     self.second_url_queue.

     :param url: search-result page URL to fetch.
     :param is_again: internal retry flag; the single retry passes False.
     :return: parsed uid or uname string, or "" when the fetch failed.
     """
     loginer = Loginer()
     cookie = loginer.get_cookie()
     proxy = loginer.get_proxy()
     craw_object = Crawler_with_proxy(url, cookie, proxy)

     WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

     uid_or_uname = ""
     try:
         page = craw_object.get_page()

         uid_or_uname = page_parser_from_search_for_uid(page)
     except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
         print(traceback.format_exc())
         crawl_set_time_with_keyword.del_proxy_lock.acquire()
         try:
             # Drop the proxy only if no other thread rotated it first.
             if proxy == loginer.get_proxy():
                 loginer.del_proxy()
                 WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
         finally:
             # Guarantee release even if del_proxy/logging raises.
             crawl_set_time_with_keyword.del_proxy_lock.release()
         if is_again:
             return self.crawl(url, is_again=False)
         else:
             self.second_url_queue.put(url)
             return uid_or_uname
     return uid_or_uname
Exemplo n.º 4
0
    def crawl(self, url, is_again=True):
        """Fetch a search-result page and parse its weibo list.

        Failure handling: on exception or an empty parse (unless the page
        legitimately has no results), rotate the shared proxy under the lock,
        retry once, then park the url on self.second_url_queue.  On the first
        result page, enqueue the remaining pages for crawling.

        :param url: search page URL; pagination number follows the last '='.
        :param is_again: internal retry flag; the single retry passes False.
        :return: list of parsed weibo objects (possibly empty on failure).
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        weibo_list = []
        try:
            page = craw_object.get_page()
            weibo_list = page_parser_from_search(page)
        except Exception:  # was a bare except
            print(traceback.format_exc())
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                # Drop the proxy only if no other thread rotated it first.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Guarantee release even if del_proxy/logging raises.
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                self.second_url_queue.put(url)
                return weibo_list

        if len(weibo_list) == 0:
            # Sina genuinely has no results for this query: empty is final.
            if zero_aviable_check_validity(page):
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " get nothing, sina does not have ! " + url)
                return weibo_list
            if weibo_guangchang_forbidden(page):
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " get nothing, forbidden ! ! " + url)
                # NOTE(review): no return here — a forbidden page still falls
                # through to proxy rotation below; confirm this is intended.

            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " get nothing, change proxy ! " + url)
            finally:
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                self.second_url_queue.put(url)
                return weibo_list

        else:
            # First result page carries the total count: enqueue the rest.
            if int(url[url.rfind('=') + 1:]) == 1:
                total_num = weibo_list[0].all_weibo_num
                self.put_second_and_more_url_queue(total_num, url)
            WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                         " crawl success! " +
                                                         url)
            return weibo_list
Exemplo n.º 5
0
    def crawl(self, url, is_again=True, two_again=True):
        """Fetch *url* and parse its comment list.

        On exception or an empty parse (unless nobody has commented yet),
        rotate the shared proxy under the lock and retry up to two more
        times via the is_again/two_again flags.

        :param url: comment page URL to fetch.
        :param is_again: first-retry flag (internal).
        :param two_again: second-retry flag (internal).
        :return: list of parsed comment objects (possibly empty).
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        comment_list = []
        page = ""
        try:
            page = craw_object.get_page()
            # Parse the page into individual comment objects.
            comment_list = page_parser_from_search_for_comment(page)
        except Exception:  # was a bare except
            print(traceback.format_exc())
            crawl_comment.del_proxy_lock.acquire()
            try:
                # Drop the proxy only if no other thread rotated it first.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Guarantee release even if del_proxy/logging raises.
                crawl_comment.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            elif two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list

        if len(comment_list) == 0:
            # Nobody has commented on this weibo yet: empty is a valid result.
            if no_one_commented(page):
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " nobody commented !")
                return comment_list

            crawl_comment.del_proxy_lock.acquire()
            try:
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " comment_list is 0 , change proxy !")
            finally:
                crawl_comment.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            elif two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list
        else:
            return comment_list
Exemplo n.º 6
0
 def crawl(self, url, is_again=True):
     """Fetch a search-result page and parse its weibo list.

     Failure handling: on exception or an empty parse (unless the page
     legitimately has no results), rotate the shared proxy under the lock,
     retry once, then park the url on self.second_url_queue.  On the first
     result page, enqueue the remaining pages for crawling.

     :param url: search page URL; pagination number follows the last '='.
     :param is_again: internal retry flag; the single retry passes False.
     :return: list of parsed weibo objects (possibly empty on failure).
     """
     loginer = Loginer()
     cookie = loginer.get_cookie()
     proxy = loginer.get_proxy()
     craw_object = Crawler_with_proxy(url, cookie, proxy)

     WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

     weibo_list = []
     try:
         page = craw_object.get_page()
         weibo_list = page_parser_from_search(page)
     except Exception:  # was a bare except
         print(traceback.format_exc())
         crawl_set_time_with_keyword.del_proxy_lock.acquire()
         try:
             # Drop the proxy only if no other thread rotated it first.
             if proxy == loginer.get_proxy():
                 loginer.del_proxy()
                 WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
         finally:
             # Guarantee release even if del_proxy/logging raises.
             crawl_set_time_with_keyword.del_proxy_lock.release()
         if is_again:
             return self.crawl(url, is_again=False)
         else:
             self.second_url_queue.put(url)
             return weibo_list

     if len(weibo_list) == 0:
         # Sina genuinely has no results for this query: empty is final.
         if zero_aviable_check_validity(page):
             WeiboSearchLog().get_scheduler_logger().info(self.name + " get nothing, sina does not have ! " + url)
             return weibo_list
         if weibo_guangchang_forbidden(page):
             WeiboSearchLog().get_scheduler_logger().info(self.name + " get nothing, forbidden ! ! " + url)
             # NOTE(review): no return here — a forbidden page still falls
             # through to proxy rotation below; confirm this is intended.

         crawl_set_time_with_keyword.del_proxy_lock.acquire()
         try:
             if proxy == loginer.get_proxy():
                 loginer.del_proxy()
                 WeiboSearchLog().get_scheduler_logger().warning(self.name + " get nothing, change proxy ! " + url)
         finally:
             crawl_set_time_with_keyword.del_proxy_lock.release()
         if is_again:
             return self.crawl(url, is_again=False)
         else:
             self.second_url_queue.put(url)
             return weibo_list

     else:
         # First result page carries the total count: enqueue the rest.
         if int(url[url.rfind('=') + 1:]) == 1:
             total_num = weibo_list[0].all_weibo_num
             self.put_second_and_more_url_queue(total_num, url)
         WeiboSearchLog().get_scheduler_logger().info(self.name + " crawl success! " + url)
         return weibo_list
Exemplo n.º 7
0
    def crawl(self, url, is_again=True, two_again=True):
        """Fetch *url* and parse its repost list.

        On exception or an empty parse, rotate the shared proxy under the
        lock and retry up to two more times via the is_again/two_again flags.
        Unlike the comment crawler, an empty repost list always triggers a
        proxy rotation (there is no "nobody reposted" page check here).

        :param url: repost page URL to fetch.
        :param is_again: first-retry flag (internal).
        :param two_again: second-retry flag (internal).
        :return: list of parsed repost objects (possibly empty).
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

        repost_list = []
        page = ""
        try:
            page = craw_object.get_page()
            # Parse the page into individual repost objects.
            repost_list = page_parser_from_search_for_repost(page)
        except Exception:  # was a bare except
            print(traceback.format_exc())
            crawl_repost.del_proxy_lock.acquire()
            try:
                # Drop the proxy only if no other thread rotated it first.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
            finally:
                # Guarantee release even if del_proxy/logging raises.
                crawl_repost.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            elif two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return repost_list

        if len(repost_list) == 0:
            crawl_repost.del_proxy_lock.acquire()
            try:
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(self.name + " repost_list is 0 , change proxy !")
            finally:
                crawl_repost.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            elif two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return repost_list
        else:
            return repost_list
Exemplo n.º 8
0
    def crawl(self):
        """POST the advanced-search form for self.keyword and parse the result.

        Retries forever — rotating the proxy and sleeping a short random
        back-off — until at least one weibo is parsed.

        :return: non-empty list of parsed weibo objects.
        """
        # Search form: real-time, original-only weibo, sorted by time.
        # (nick / starttime / endtime fields are intentionally unset.)
        self.data['advancedfilter'] = '1'
        self.data['keyword'] = self.keyword
        self.data['hasori'] = '1'
        self.data['sort'] = 'time'
        self.data['smblog'] = '搜索'
        url = 'http://weibo.cn/search/'

        # Retry loop instead of unbounded self-recursion: a long proxy outage
        # would otherwise exhaust the recursion limit.
        while True:
            loginer = Loginer()
            cookie = loginer.get_cookie()
            proxy = loginer.get_proxy()
            craw_object = Crawler_with_proxy(url, cookie, proxy)

            try:
                page = craw_object.get_page_with_form(self.data)
                weibo_list = page_parser_from_search(page)
            except Exception:  # was a bare except
                print(traceback.format_exc())
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
                time.sleep(int(random.random() * 10))
                continue

            if weibo_list:
                return weibo_list

            # Empty parse: treat as a dead proxy and try again.
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(
                self.name + " get nothing, change proxy !")
            time.sleep(int(random.random() * 10))