Example #1
0
    def crawl(self, url, is_again=True):
        """Fetch *url* through the current proxy and parse a uid/username.

        On failure the (presumably dead) proxy is discarded and the crawl
        is retried once; if the retry also fails the url is queued on
        ``self.second_url_queue`` for a later pass.

        :param url: search-result page to fetch.
        :param is_again: True on the first attempt, False on the retry.
        :return: the parsed uid or username, or "" when every attempt failed.
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        uid_or_uname = ""
        try:
            page = craw_object.get_page()

            uid_or_uname = page_parser_from_search_for_uid(page)
        except Exception:  # was a bare except: don't swallow SystemExit etc.
            print(traceback.format_exc())
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                # Drop the proxy only if no other thread rotated it already.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Always release, even if del_proxy() itself raises.
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                self.second_url_queue.put(url)
                return uid_or_uname
        return uid_or_uname
Example #2
0
    def _drop_proxy_if_current(self, loginer, proxy, reason):
        # Discard *proxy* unless another thread has already rotated it away;
        # the lock release is in a finally so a raise cannot leak the lock.
        crawl_comment.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + reason)
        finally:
            crawl_comment.del_proxy_lock.release()

    def crawl(self, url, is_again=True, two_again=True):
        """Fetch *url* and parse the comments found on the page.

        A failed fetch, or an unexpectedly empty comment list, discards the
        current proxy and retries: once via ``is_again`` and once more via
        ``two_again``.

        :param url: comment page to fetch.
        :param is_again: True on the first attempt.
        :param two_again: True until the final retry has been used.
        :return: list of parsed comments (possibly empty).
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        comment_list = []
        page = ""
        try:
            page = craw_object.get_page()
            # Parse the page into individual comment records.
            comment_list = page_parser_from_search_for_comment(page)
        except Exception:  # was a bare except: don't swallow SystemExit etc.
            print(traceback.format_exc())
            self._drop_proxy_if_current(
                loginer, proxy, " proxy exception , change proxy !")
            if is_again:
                return self.crawl(url, is_again=False)
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list

        if len(comment_list) == 0:
            # Nobody has commented on this weibo yet - empty is correct.
            if no_one_commented(page):
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " nobody commented !")
                return comment_list

            # Empty list although the page claims comments exist: the proxy
            # probably served a bad page - rotate it and retry.
            self._drop_proxy_if_current(
                loginer, proxy, " comment_list is 0 , change proxy !")
            if is_again:
                return self.crawl(url, is_again=False)
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list
        else:
            return comment_list
Example #3
0
    def crawl(self, uid_or_nickname, is_again=False):
        """Fetch the weibo.cn ``/info`` page of a user and parse it.

        :param uid_or_nickname: numeric uid or a nickname identifying the user.
        :param is_again: when True, one retry is attempted on failure.
        :return: the parsed user info, or "" when the page could not be
            fetched/parsed.
        """
        # Sanity-check that the identifier is URL-quotable; a failure is only
        # logged because the raw value is what gets embedded in the url below.
        try:
            quote_plus(str(uid_or_nickname.strip()))
        except Exception:
            print(traceback.format_exc())
            print(uid_or_nickname)

        url = "http://weibo.cn/" + uid_or_nickname + "/info"

        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()

        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        user_info = ""
        try:
            page = craw_object.get_page()

            user_info = page_parser_from_search_for_UserInfo(page, url)
        except Exception:
            if is_again:
                # BUG FIX: retry with the original identifier, not the
                # already-built url (which would have produced a
                # double-prefixed "http://weibo.cn/http://weibo.cn/..." url).
                return self.crawl(uid_or_nickname, is_again=False)
            else:
                return user_info

        return user_info