def get_content(url):
    """Fetch *url* and return the cleaned-up HTML of its ``.blogbody`` element.

    Removes comment nodes, scripts, ad/menu boxes and presentational
    attributes, then collapses consecutive ``<br/>`` tags into one.
    Returns an empty string when the page has no ``.blogbody`` element.
    """
    r = get(url)
    # NOTE(review): the text is re-encoded to EUC-JP bytes before parsing;
    # this looks intentional for the target (Japanese) site -- confirm.
    q = pyquery.PyQuery(r.text.encode('EUC-JP', 'ignore'))
    # Drop comment/PI nodes: for those lxml nodes ``.tag`` is callable.
    # (Plain loop instead of a throwaway side-effect list comprehension.)
    for node in q.root.iter():
        if callable(node.tag):
            q(node).remove()
    q('script, .posted, .amazlet-box, .poweredAdsBy, .menu').remove()
    # Strip presentational attributes from layout elements.
    q('.blogbody div, span, br').each(lambda i, e: e.attrib.clear())
    for node in q.root.iter():
        strip(node)
    content = q('.blogbody').html()
    if content is None:
        # No .blogbody matched; re.sub(None) would raise TypeError.
        return ''
    # Collapse runs of <br/> into a single tag.
    return re.sub(r'(<br/>)+', '<br/>', content)
def feedlist():
    """Parse the module-level RSS ``url`` and persist any new entries.

    Entries already present in the feed store (looked up by link) are
    skipped.  Returns the list of newly created and saved Feed objects.
    """
    _feedlist = []
    ret = feedparser.parse(url)
    # Log via the module logger (lazy %-args) instead of a Python-2-only
    # print statement, consistent with the rest of this function.
    LOG.info("rss feed: %s %s %s",
             ret['feed']['title'], ret['feed']['link'],
             ret['feed']['description'])
    LOG.info("rss items count:%s", len(ret.entries))
    for _item in ret.entries:
        _ret = feed.get(link=_item.link)
        LOG.info("feed.get result:%s for link:%s", _ret, _item.link)
        if _ret is None:
            # Unseen entry: copy it into a new Feed record and save it.
            _feed = feed.Feed()
            _feed.title = _item.title
            _feed.link = _item.link
            _feed.desc = _item.description
            _feed.publish_at = datetime.fromtimestamp(
                mktime(_item.published_parsed))
            feed.save(_feed)
            _feedlist.append(_feed)
    return _feedlist