# Exemplo n.º 1 (scraped example-listing header; score: 0)
    def parse(self, page):
        '''
        Parse a search-result page, store each weibo into DB and collect the
        comment count per weibo id.

        If the weibo trees cannot be built from the page at all, processing
        stops and AdvKeywordWeiboPageParseException is re-raised to the
        caller.  A single feed failing to parse is only logged and the rest
        of the page is still processed.

        Returns:
            [crawl_feed_count, new_feed_count, increment,
             {mid: n_comment}]
        '''
        url_wrapper = self.url_wrapper

        search_url = url_wrapper.to_url()
        weibo_type = url_wrapper.get_url_type()
        page_num = url_wrapper.page_num

        weibo_comment_infor = {}   # mid -> n_comment for every parsed weibo
        crawl_feed_count = 0       # feeds seen on the page
        new_feed_count = 0         # feeds newly stored by the storer

        # `increment` counts feeds appearing before the previously-recorded
        # checkpoint mid; `last_mid` remembers the first (newest) mid on this
        # page so it can become the next checkpoint.
        increment = 0
        increment_mark = True
        last_mid = '0'
        last_mid_mark = True

        try:
            weibo_trees = page_2_weibo_trees_adv(page)
        except AdvKeywordWeiboPageParseException:
            # For exception handling: if the tree parsing failed, the whole
            # page fails.  Bare `raise` keeps the original traceback
            # (`raise err` would reset it in Python 2).
            raise

        if not weibo_trees:
            # Keep the return shape consistent with the normal path
            # (the original early return omitted `increment`).
            return [crawl_feed_count, new_feed_count, increment,
                    weibo_comment_infor]

        for weibo_tree in weibo_trees:
            weibo = None
            crawl_feed_count += 1
            try:
                weibo = weibo_tree_2_weibo_adv(self.url_wrapper.keyword,
                                               weibo_tree, page_num,
                                               weibo_type)
            except AdvKeywordWeiboPageParseException as err:
                # One bad feed is logged and skipped; the page goes on.
                my_log.write_log(log_type=weibo_type,
                                 operation_status=0,
                                 fail_code=err.get_error_code(),
                                 err_msg=search_url)
            if not weibo:
                continue

            storer = AdvKeywordHotWeiboStorer(weibo)
            weibo_comment_infor[weibo.mid] = weibo.n_comment

            # store() returning exactly True marks a brand-new record.
            if storer.store() is True:
                new_feed_count += 1

            # Remember the first mid on the page as the new checkpoint.
            if last_mid_mark:
                last_mid = weibo.mid
                last_mid_mark = False

            # Once we hit the old checkpoint, later feeds were seen before.
            if weibo.mid == self.url_wrapper.last_mid:
                increment_mark = False

            if increment_mark:
                increment += 1

        # Never report fewer incremental feeds than newly-stored ones.
        if increment < new_feed_count:
            increment = new_feed_count

        self.url_wrapper.last_mid = last_mid

        return [crawl_feed_count, new_feed_count, increment, weibo_comment_infor]
    def parse(self, page):
        '''
        Parse the page, store weibos into DB and return {weibo id, weibo comment number}
        If the trees getting failed(Exception), stop consequent processing and raise AdvKeywordWeiboPageParseException

        Return [crawl_feed_count, new_feed_count, {mid:n_comment}]
        '''
        
        url_wrapper = self.url_wrapper
        
        search_url = url_wrapper.to_url()
        weibo_type = url_wrapper.get_url_type()
        page_num = url_wrapper.page_num
        
        weibo_comment_infor = {}
        crawl_feed_count = 0
        new_feed_count = 0

        weibo_trees = []
        try:
            weibo_trees = page_2_weibo_trees_adv(page)
            
#             print "weibo_trees's size is: " + str(len(weibo_trees))
        except AdvKeywordWeiboPageParseException as err:
            #for exception hanlde, if the weibo_trees parse failed, this page process failed
            s = traceback.format_exc()
            
            scheduler_logger.error(s)
            scheduler_logger.error(self.url_wrapper.tostring() + "\t" + self.url_wrapper.to_url())
            
            raise err

        if not weibo_trees:
            print 'no weibo_trees' 
            return [crawl_feed_count, new_feed_count, weibo_comment_infor]

        increment = 0
        increment_mark = True
        last_mid = '0'
        last_mid_mark = True
        
        for weibo_tree in weibo_trees:
            weibo = None
            crawl_feed_count += 1
            try:
                weibo = weibo_tree_2_weibo_adv(self.url_wrapper.keyword, weibo_tree, page_num, weibo_type)
            
            #need to log_type
            except AdvKeywordWeiboPageParseException as err:
                my_log.write_log(log_type=weibo_type, 
                                 operation_status=0,
                                 fail_code=err.get_error_code(),
                                 err_msg=search_url)
            except AttributeError:
                continue
                # except:
                # s = traceback.format_exc()
            #                 print s
                
            if not weibo:
#                 print "continue"
                continue

            storer = AdvKeywordRealWeiboStorer(weibo)
            weibo_comment_infor[weibo.mid] = weibo.n_comment
            
            if storer.store() is True:
                new_feed_count += 1
#             else:
#                 print weibo.create_time
#                 print weibo.mid
#                 print weibo.content
                
            if last_mid_mark:
                last_mid = weibo.mid
                last_mid_mark = False
                
            if weibo.mid == self.url_wrapper.last_mid:
                increment_mark = False
                
            if increment_mark:
                increment += 1
        
        if increment < new_feed_count:
            increment = new_feed_count
            
        self.url_wrapper.last_mid = last_mid
        
        return [crawl_feed_count, new_feed_count, increment, weibo_comment_infor]
# Exemplo n.º 3 (scraped example-listing header; score: 0)
    def parse(self, page):
        '''
        Parse a search-result page, store each weibo into DB via
        AdvKeywordRealWeiboStorer and collect the comment count per weibo id.

        If the weibo trees cannot be built from the page at all, the failure
        is logged to scheduler_logger and AdvKeywordWeiboPageParseException
        is re-raised.  A single feed failing to parse is only logged (or
        silently skipped on AttributeError) and the rest of the page is
        still processed.

        Returns [crawl_feed_count, new_feed_count, increment,
        {mid: n_comment}] on the normal path.  NOTE(review): the empty-page
        early return below yields only three elements (no increment) --
        confirm callers handle both shapes.
        '''

        url_wrapper = self.url_wrapper

        search_url = url_wrapper.to_url()
        weibo_type = url_wrapper.get_url_type()
        page_num = url_wrapper.page_num

        weibo_comment_infor = {}   # mid -> n_comment for every parsed weibo
        crawl_feed_count = 0       # feeds seen on the page
        new_feed_count = 0         # feeds newly stored by the storer

        weibo_trees = []
        try:
            weibo_trees = page_2_weibo_trees_adv(page)

        except AdvKeywordWeiboPageParseException as err:
            # For exception handling: if the tree parsing failed, this whole
            # page fails.  Log the traceback and the URL, then re-raise.
            # NOTE(review): `raise err` resets the traceback in Python 2;
            # a bare `raise` would preserve it.
            s = traceback.format_exc()

            scheduler_logger.error(s)
            scheduler_logger.error(self.url_wrapper.tostring() + "\t" +
                                   self.url_wrapper.to_url())

            raise err

        if not weibo_trees:
            print 'no weibo_trees'
            return [crawl_feed_count, new_feed_count, weibo_comment_infor]

        # `increment` counts feeds appearing before the previously-recorded
        # checkpoint mid; `last_mid` remembers the first (newest) mid on this
        # page so it can become the next checkpoint.
        increment = 0
        increment_mark = True
        last_mid = '0'
        last_mid_mark = True

        for weibo_tree in weibo_trees:
            weibo = None
            crawl_feed_count += 1
            try:
                weibo = weibo_tree_2_weibo_adv(self.url_wrapper.keyword,
                                               weibo_tree, page_num,
                                               weibo_type)

            # One bad feed is logged and skipped; the page goes on.
            except AdvKeywordWeiboPageParseException as err:
                my_log.write_log(log_type=weibo_type,
                                 operation_status=0,
                                 fail_code=err.get_error_code(),
                                 err_msg=search_url)
            # Malformed tree missing expected attributes: skip silently
            # (best-effort; deliberately not escalated).
            except AttributeError:
                continue

            if not weibo:
                continue

            storer = AdvKeywordRealWeiboStorer(weibo)
            weibo_comment_infor[weibo.mid] = weibo.n_comment

            # store() returning exactly True marks a brand-new record.
            if storer.store() is True:
                new_feed_count += 1

            # Remember the first mid on the page as the new checkpoint.
            if last_mid_mark:
                last_mid = weibo.mid
                last_mid_mark = False

            # Once we hit the old checkpoint, later feeds were seen before.
            if weibo.mid == self.url_wrapper.last_mid:
                increment_mark = False

            if increment_mark:
                increment += 1

        # Never report fewer incremental feeds than newly-stored ones.
        if increment < new_feed_count:
            increment = new_feed_count

        self.url_wrapper.last_mid = last_mid

        return [
            crawl_feed_count, new_feed_count, increment, weibo_comment_infor
        ]