class FullRssParser:
    """Fetch an RSS feed and recover the full article text for its entries.

    Strategy: fetch the page referenced by each feed entry, split it into
    text blocks, and pick the block whose stripped text is closest (by
    Levenshtein distance) to the feed's own summary/content.  The richer of
    the two texts is handed to ``callback`` as a dict.

    NOTE(review): Python 2 codebase (``except E, e``, ``has_key``,
    ``str.decode``) — kept as-is for compatibility.
    """

    def __init__(self, url, etag=None, last_modified=None, proxy=None,
                 proxies=None, callback=None, check_baseurl=2,
                 multithread=False, debug=False):
        """Kick off the crawl immediately.

        url           -- RSS feed URL to fetch.
        etag/last_modified -- HTTP conditional-GET validators.
        proxy         -- single proxy for the fetch.
        proxies       -- optional proxy list handed to the crawler.
        callback      -- callable receiving one dict per parsed entry.
        check_baseurl -- 1: guess base URL only once; any other non-None
                         value: guess it per entry; None: never.
        """
        self.baseurl = None
        self.mario = MarioRss(callback=self.rss_parser)
        self.callback = callback
        self.check_baseurl = check_baseurl
        self.multithread = multithread
        if proxies:
            # BUG FIX: original called bare ``mario.set_proxies_list`` —
            # no such local exists, so passing ``proxies`` raised NameError.
            self.mario.set_proxies_list(proxies)
        self.rss_response = self.mario.get(
            starturl=None, rssurl=url, etag=etag,
            last_modified=last_modified, proxy=proxy,
            multithread=self.multithread)
        self.debug = debug

    def rss_parser(self, response):
        """Per-page callback: extract full content and invoke the user callback.

        Returns None on any bail-out (empty response, unparseable HTML,
        no matching feed entry, no text blocks).
        """
        if not response or not response.body:
            return None
        try:
            tree = BeautifulSoup(response.body, fromEncoding='utf-8')
        except Exception:
            # Unparseable HTML: skip this entry silently (best-effort crawl).
            return None
        feed = self.matched_feed(response)
        if not feed:
            return None
        blocks = get_textblocks(tree.body)
        if not blocks:
            return None
        res_blocks = []
        # Feed entries carry either 'summary' or 'content'; prefer summary.
        try:
            feed_content = striptags(feed['summary'])
        except KeyError:
            feed_content = striptags(feed['content'][0]['value'])
        for b in blocks:
            dist = levenshtein_distance(feed_content, striptags(b.orig_text))
            #dist = len(lcs(striptags(feed['summary']), b.orig_text))
            res_blocks.append((dist, b))
        # Closest block (smallest edit distance) is the best candidate.
        res_blocks = sorted(res_blocks, key=operator.itemgetter(0))
        block = res_blocks[0][1]
        blocks = None  # release the block list early
        lp = LayoutParser(None, None)
        elements = lp.get_elements(block.path)
        element = lp.find_element(tree.body, elements, tree.body, False)
        if element:
            #return element.renderContents().strip()
            content = striptags(element.renderContents())
            # Keep whichever text is longer: extracted page text vs feed text.
            if len(content) < len(feed_content):
                content = feed_content
        else:
            content = feed_content
        url = response.effective_url.decode('utf-8')
        if isinstance(content, str):
            content = content.decode('utf-8')
        try:
            author = feed.author.decode('utf-8')
        except Exception:
            # Was a bare ``except:`` — narrowed so KeyboardInterrupt/
            # SystemExit are no longer swallowed; missing author -> empty.
            author = u''
        if self.callback:
            # NOTE(review): precedence is (check==1 and no baseurl yet)
            # OR (check is not None) — i.e. any non-None check_baseurl
            # re-guesses every time; kept as originally written.
            if self.check_baseurl == 1 and not self.baseurl \
                    or self.check_baseurl is not None:
                self.baseurl = guess_baseurl(url, tree)
            else:
                self.baseurl = None
            if feed.has_key('updated_parsed'):
                updated_parsed = feed['updated_parsed']
            else:
                updated_parsed = None
            self.callback({'url':url, 'title':striptags(feed['title']), 'content':content, 'updated_parsed':updated_parsed, 'author':author, 'baseurl':self.baseurl, 'etag':response.etag, 'last_modified':response.last_modified})
def __init__(self, url, etag=None, last_modified=None, proxy=None,
             proxies=None, callback=None, check_baseurl=2,
             multithread=False, debug=False):
    """Build the crawler and fetch ``url`` immediately.

    url           -- RSS feed URL.
    etag/last_modified -- HTTP conditional-GET validators.
    proxy/proxies -- single proxy / proxy list for the crawler.
    callback      -- per-entry result callback.
    check_baseurl -- base-URL guessing policy (see class docs elsewhere).
    """
    self.baseurl = None
    self.mario = MarioRss(callback=self.rss_parser)
    self.callback = callback
    self.check_baseurl = check_baseurl
    self.multithread = multithread
    if proxies:
        # BUG FIX: original referenced undefined local ``mario`` —
        # NameError whenever ``proxies`` was supplied.
        self.mario.set_proxies_list(proxies)
    self.rss_response = self.mario.get(
        starturl=None, rssurl=url, etag=etag,
        last_modified=last_modified, proxy=proxy,
        multithread=self.multithread)
    self.debug = debug
def run(self, rssurl=None, rssbody=None, concount=CONCOUNT):
    """Crawl the feed multithreaded (with duplicate checking) and dump results.

    rssurl/rssbody -- feed location or pre-fetched feed body.
    concount       -- connection concurrency handed to the crawler.
    """
    crawler = MarioRss(
        callback=self.callback,
        callpre=self.callpre,
        callfail=self.callfail,
        check_duplicate=True,
        concount=concount,
        multithread=True,
    )
    feed = crawler.get(self.starturl, rssurl, rssbody)
    self.dump(crawler, feed)
def runRss(self, rssurl=None, rssBody=None, concount=CONCOUNT):
    """Crawl the feed single-threaded and dump results.

    In analysis mode only the first 10 entries are fetched;
    otherwise the whole feed is processed (no limit).
    """
    limit = None
    if self.analysis:
        limit = 10
    fetcher = MarioRss(
        callback=self.callback,
        callpre=self.callpre,
        callfail=self.callfail,
        concount=concount,
    )
    feed = fetcher.get(self.starturl, rssurl, rssBody, limit)
    self.dump(fetcher, feed)