Пример #1
0
def get_content(url):
    """Fetch *url* and return the cleaned inner HTML of its ``.blogbody``.

    The response text is re-encoded as EUC-JP (unencodable characters are
    dropped) and parsed with PyQuery.  Nodes with a callable tag, scripts
    and elements matching a fixed list of selectors are removed, attributes
    are cleared on the matched div/span/br elements, every remaining node
    is passed to ``strip`` and runs of consecutive ``<br/>`` are collapsed
    into a single one.

    Args:
        url: Address of the page; passed unchanged to ``get``.

    Returns:
        The resulting HTML string.  An empty string is returned when the
        page has no ``.blogbody`` element (or it yields no HTML) instead
        of failing inside ``re.sub`` on ``None``.
    """
    # NOTE(review): `get` presumably returns a requests-style response
    # exposing `.text` -- confirm against the file's imports.
    r = get(url)
    q = pyquery.PyQuery(r.text.encode('EUC-JP', 'ignore'))

    # Drop nodes whose tag is callable (lxml represents comments that way).
    # Snapshot the traversal first so removals cannot disturb the iteration.
    for node in list(q.root.iter()):
        if callable(node.tag):
            q(node).remove()

    q('script, .posted, .amazlet-box, .poweredAdsBy, .menu').remove()

    # Only `div` is scoped to .blogbody by this selector; `span` and `br`
    # are matched anywhere in the document.
    for element in q('.blogbody div, span, br'):
        element.attrib.clear()

    # Iterated lazily, exactly as before, because `strip` is defined
    # elsewhere and may or may not alter the tree.
    for node in q.root.iter():
        strip(node)

    # .html() yields None when nothing matched; fall back to an empty string.
    content = q('.blogbody').html() or ''
    return re.sub(r'(<br/>)+', '<br/>', content)
Пример #2
0
def get_content(url):
    """Load *url* and return the cleaned article HTML followed by comments.

    Steps, in order:

    1. For every anchor whose text contains ``pic.twitter.com``, load the
       page the anchor points to and insert each non-avatar image found
       in its ``.permalink-tweet-container`` in front of the anchor.
    2. Remove nodes with a callable tag, scripts and elements matching a
       fixed list of selectors.
    3. Remove the last two tables directly under ``.article_bodymore``
       and every ``#comment_list li`` after the 30th.
    4. Replace each comment ``dl`` with the text of its second child and
       turn the first ``#comment_list ul`` into an ``ol``.
    5. Wrap every image in a left-floating ``div``, pass every node to
       ``strip`` and collapse runs of ``<br/>`` into one.

    Args:
        url: Address of the page; passed unchanged to ``pq``.

    Returns:
        The inner HTML of ``.article`` concatenated with the inner HTML
        of ``#comment``.  A missing part contributes an empty string
        instead of raising ``TypeError``.
    """
    # NOTE(review): `pq` is presumably pyquery.PyQuery, which fetches the
    # document when given a URL -- confirm against the file's imports.
    q = pq(url)

    # Inline the images of linked tweets.  Anchors without an href and
    # images without a src are skipped rather than raising KeyError.
    for link in q('a:contains("pic.twitter.com")'):
        href = link.get('href')
        if not href:
            continue
        tweet_page = pq(href)
        for image in tweet_page('.permalink-tweet-container img:not(.avatar)'):
            src = image.get('src')
            if src:
                q(link).before(q('<img>').attr('src', src))

    # Drop nodes whose tag is callable (lxml represents comments that way).
    # Snapshot the traversal first so removals cannot disturb the iteration.
    for node in list(q.root.iter()):
        if callable(node.tag):
            q(node).remove()

    q('script, .footer_social_ad, .button_top, .article_footer').remove()
    q('.ad_amazon, .jin-ads, #other_news_website').remove()
    q('#popular_articles_comment, #hot_tweet, #category-link').remove()
    q('.related-articles, #ad2, .ent_ad_md, #ad_rs, #tags').remove()
    q('.tooltip, .comment_form, .article_header').remove()

    # Trim the last two tables of the article body and keep at most the
    # first 30 comments.
    q(q('.article_bodymore > table')[-2:]).remove()
    q(q('#comment_list li')[30:]).remove()

    # Each comment <dl> is replaced by the text of its second child.
    q('#comment_list li dl').replace_with(lambda i, x: x[1].text)

    # Turn the comment list into an ordered list, if there is one.
    comment_lists = q('#comment_list ul')
    if comment_lists:
        comment_lists[0].tag = 'ol'

    q('img').wrap('<div style="float: left !important">')

    # Iterated lazily, exactly as before, because `strip` is defined
    # elsewhere and may or may not alter the tree.
    for node in q.root.iter():
        strip(node)

    # .html() yields None when nothing matched; treat that as empty.
    article_html = q('.article').html() or ''
    comment_html = q('#comment').html() or ''
    return re.sub(r'(<br/>)+', '<br/>', article_html + comment_html)