示例#1
0
class FullRssParser:
    """Fetch an RSS feed through MarioRss and report full-text entries.

    A ``MarioRss`` instance is created with ``rss_parser`` as its callback.
    For a response handed to ``rss_parser`` the page is parsed, the text
    block closest (by Levenshtein distance) to the feed entry's own
    summary/content is located, and the longer of "text extracted from the
    page" and "text shipped in the feed" is passed to the user callback.

    NOTE(review): ``rss_parser`` calls ``self.matched_feed``, which is not
    defined in the class body shown here; it is presumably supplied
    elsewhere (subclass / mixin) -- confirm.
    """

    def __init__(self, url, etag=None, last_modified=None, proxy=None, proxies=None, callback=None, check_baseurl=2, multithread=False, debug=False):
        """Create the fetcher and immediately start fetching ``url``.

        url           -- feed address, passed to ``MarioRss.get`` as ``rssurl``.
        etag          -- forwarded unchanged to ``MarioRss.get``.
        last_modified -- forwarded unchanged to ``MarioRss.get``.
        proxy         -- forwarded unchanged to ``MarioRss.get``.
        proxies       -- when truthy, handed to ``MarioRss.set_proxies_list``.
        callback      -- callable receiving one dict per parsed entry (see
                         ``rss_parser``); when falsy nothing is reported.
        check_baseurl -- ``None`` disables base-URL guessing in
                         ``rss_parser``; any other value enables it.
        multithread   -- forwarded unchanged to ``MarioRss.get``.
        debug         -- stored on the instance; not read by this class.

        The value returned by ``MarioRss.get`` is kept in ``rss_response``.
        """
        self.baseurl = None
        self.callback = callback
        self.check_baseurl = check_baseurl
        self.multithread = multithread
        # Assigned before the fetch is started so the attribute already
        # exists should rss_parser be invoked while get() is running.
        self.debug = debug
        self.mario = MarioRss(callback=self.rss_parser)
        if proxies:
            # Was a bare `mario.set_proxies_list(...)`, which raised
            # NameError whenever a proxy list was actually supplied.
            self.mario.set_proxies_list(proxies)
        self.rss_response = self.mario.get(
            starturl=None,
            rssurl=url,
            etag=etag,
            last_modified=last_modified,
            proxy=proxy,
            multithread=self.multithread,
        )

    def rss_parser(self, response):
        """Extract one entry's text from ``response`` and report it.

        Returns ``None`` in every case. Processing stops early when the
        response or its body is empty, the body cannot be parsed, no feed
        entry matches the response, or no text blocks are found.

        Otherwise, if ``self.callback`` is set, it is called with a dict
        holding the keys ``url``, ``title``, ``content``, ``updated_parsed``,
        ``author``, ``baseurl``, ``etag`` and ``last_modified``.
        """
        if not response or not response.body:
            return None
        try:
            tree = BeautifulSoup(response.body, fromEncoding='utf-8')
        except Exception:
            # Best effort: an unparsable page is skipped, not fatal.
            return None
        feed = self.matched_feed(response)
        if not feed:
            return None
        blocks = get_textblocks(tree.body)
        if not blocks:
            return None

        # Text shipped in the feed itself: prefer 'summary', fall back to
        # the first 'content' item when 'summary' is missing.
        try:
            feed_content = striptags(feed['summary'])
        except KeyError:
            feed_content = striptags(feed['content'][0]['value'])

        # The block whose text is closest to the feed text wins; on ties
        # min() keeps the earliest block, as the former stable sort did.
        best_block = min(
            blocks,
            key=lambda blk: levenshtein_distance(feed_content, striptags(blk.orig_text)),
        )

        lp = LayoutParser(None, None)
        elements = lp.get_elements(best_block.path)
        element = lp.find_element(tree.body, elements, tree.body, False)

        # Use the text extracted from the page only if it is at least as
        # long as what the feed already provided.
        content = feed_content
        if element:
            extracted = striptags(element.renderContents())
            if len(extracted) >= len(feed_content):
                content = extracted

        url = response.effective_url.decode('utf-8')
        if isinstance(content, str):
            content = content.decode('utf-8')
        try:
            author = feed.author.decode('utf-8')
        except Exception:
            # Missing or undecodable author is reported as empty.
            author = u''

        if self.callback:
            # NOTE(review): the original condition was
            #   check_baseurl == 1 and not self.baseurl or check_baseurl != None
            # whose last clause subsumes the first, so it always reduced to
            # the test below. The "== 1 and not self.baseurl" part looks like
            # an intended "guess only once" mode -- confirm before changing.
            if self.check_baseurl is not None:
                self.baseurl = guess_baseurl(url, tree)
            else:
                self.baseurl = None
            if 'updated_parsed' in feed:
                updated_parsed = feed['updated_parsed']
            else:
                updated_parsed = None
            self.callback({
                'url': url,
                'title': striptags(feed['title']),
                'content': content,
                'updated_parsed': updated_parsed,
                'author': author,
                'baseurl': self.baseurl,
                'etag': response.etag,
                'last_modified': response.last_modified,
            })
示例#2
0
 def __init__(self, url, etag=None, last_modified=None, proxy=None, proxies=None, callback=None, check_baseurl=2, multithread=False, debug=False):
     """Create the MarioRss fetcher and immediately start fetching ``url``.

     ``url`` is passed to ``MarioRss.get`` as ``rssurl``; ``etag``,
     ``last_modified``, ``proxy`` and ``multithread`` are forwarded to the
     same call unchanged. A truthy ``proxies`` is handed to
     ``MarioRss.set_proxies_list``. ``callback``, ``check_baseurl`` and
     ``debug`` are only stored on the instance here. The value returned
     by ``MarioRss.get`` is kept in ``self.rss_response``.
     """
     self.baseurl = None
     self.callback = callback
     self.check_baseurl = check_baseurl
     self.multithread = multithread
     # Assigned before the fetch is started so the attribute already
     # exists should self.rss_parser be invoked while get() is running.
     self.debug = debug
     self.mario = MarioRss(callback=self.rss_parser)
     if proxies:
         # Was a bare `mario.set_proxies_list(...)`, which raised
         # NameError whenever a proxy list was actually supplied.
         self.mario.set_proxies_list(proxies)
     self.rss_response = self.mario.get(
         starturl=None,
         rssurl=url,
         etag=etag,
         last_modified=last_modified,
         proxy=proxy,
         multithread=self.multithread,
     )
示例#3
0
 def run(self, rssurl=None, rssbody=None, concount=CONCOUNT):
     """Fetch the feed with a fresh MarioRss and dump the outcome.

     The fetcher is wired to this object's ``callback``, ``callpre`` and
     ``callfail`` hooks and created with ``check_duplicate=True`` and
     ``multithread=True``; ``concount`` is forwarded as given. The
     fetcher and the value returned by its ``get`` call are then handed
     to ``self.dump``.
     """
     fetcher_options = dict(
         callback=self.callback,
         callpre=self.callpre,
         callfail=self.callfail,
         check_duplicate=True,
         concount=concount,
         multithread=True,
     )
     fetcher = MarioRss(**fetcher_options)
     fetched = fetcher.get(self.starturl, rssurl, rssbody)
     self.dump(fetcher, fetched)
示例#4
0
 def runRss(self, rssurl=None, rssBody=None, concount=CONCOUNT):
     """Fetch the feed with a fresh MarioRss and dump the outcome.

     The fourth argument given to ``MarioRss.get`` is 10 when
     ``self.analysis`` is truthy and ``None`` otherwise. The fetcher is
     wired to this object's ``callback``, ``callpre`` and ``callfail``
     hooks; it and the value returned by ``get`` are handed to
     ``self.dump``.
     """
     cap = 10 if self.analysis else None
     fetcher = MarioRss(
         callback=self.callback,
         callpre=self.callpre,
         callfail=self.callfail,
         concount=concount,
     )
     fetched = fetcher.get(self.starturl, rssurl, rssBody, cap)
     self.dump(fetcher, fetched)