Exemplo n.º 1
0
def get_content(url):
    """Fetch *url* and return the cleaned-up HTML of its '.blogbody' element.

    Strips scripts, ad boxes, navigation chrome and element attributes,
    then collapses repeated <br/> tags into one.
    """
    r = get(url)
    # NOTE(review): encodes the response text to EUC-JP bytes with
    # errors='ignore' before parsing — presumably to normalize a
    # Japanese-encoded page; confirm this matches the target site.
    q = pyquery.PyQuery(r.text.encode('EUC-JP', 'ignore'))
    # Drop comment/processing-instruction nodes: for those, .tag is a
    # callable rather than a tag-name string.
    for node in q.root.iter():
        if callable(node.tag):
            q(node).remove()
    # Remove scripts, ads and other non-content chrome.
    q('script, .posted, .amazlet-box, .poweredAdsBy, .menu').remove()
    # Clear all attributes (class/style/...) from the remaining content tags.
    q('.blogbody div, span, br').each(lambda i, e: e.attrib.clear())
    # Apply the module-level strip() helper to every node.
    for node in q.root.iter():
        strip(node)
    content = q('.blogbody').html()
    # Collapse runs of <br/> into a single line break.
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content
Exemplo n.º 2
0
def feedlist():
    _feedlist = []
    ret = feedparser.parse(url)
    print ret['feed']['title'],ret['feed']['link'],ret['feed']['description']
    LOG.info("rss items count:%s" % len(ret.entries))
    for _item in ret.entries:
        #print _item.title,_item.link,_item.description
        #dt = datetime.fromtimestamp(mktime(_item.published_parsed))
        #print _item.published_parsed,dt
        _ret = feed.get(link=_item.link)
        LOG.info("feed.get result:%s for link:%s" % (_ret,_item.link))
        if _ret is None:
            _feed = feed.Feed()
            _feed.title = _item.title
            _feed.link = _item.link
            _feed.desc = _item.description
            _feed.publish_at = datetime.fromtimestamp(mktime(_item.published_parsed))
            feed.save(_feed)
            _feedlist.append(_feed)
    return _feedlist