def get_content(url):
    """Return the cleaned inner HTML of the page's ``.blogbody`` element.

    The page is fetched with ``get(url)``; its text is re-encoded as EUC-JP
    (characters that cannot be encoded are dropped) and parsed with PyQuery.
    The tree is then cleaned in place before ``.blogbody`` is serialised.

    Args:
        url: Address passed straight to ``get``.

    Returns:
        The HTML inside ``.blogbody`` with every run of consecutive
        ``<br/>`` tags collapsed to a single ``<br/>``.

    NOTE(review): if ``.blogbody`` matches nothing, ``html()`` presumably
    returns ``None`` and ``re.sub`` would raise ``TypeError`` -- confirm.
    """

    def _clear_attributes(index, element):
        # Two-argument callback for PyQuery.each(): wipe all attributes.
        element.attrib.clear()

    response = get(url)
    encoded_markup = response.text.encode('EUC-JP', 'ignore')
    doc = pyquery.PyQuery(encoded_markup)

    # Nodes whose ``tag`` is callable are the ones the original author
    # labelled "comment" (lxml gives comment nodes a callable tag).
    for node in doc.root.iter():
        if callable(node.tag):
            doc(node).remove()

    # Scripts plus site-specific boxes that are not part of the article.
    doc('script, .posted, .amazlet-box, .poweredAdsBy, .menu').remove()

    # The selector has three comma-separated groups: divs inside .blogbody,
    # and every span and br in the document.
    doc('.blogbody div, span, br').each(_clear_attributes)

    # ``strip`` is defined elsewhere in the project and is applied to every
    # node -- presumably trims whitespace; verify against its definition.
    for node in doc.root.iter():
        strip(node)

    body_html = doc('.blogbody').html()
    return re.sub(r'(<br/>)+', '<br/>', body_html)
def get_content(url):
    """Return the cleaned article HTML followed by its comment HTML.

    The document is built with ``pq(url)`` and then cleaned in place:
    images from linked ``pic.twitter.com`` pages are inlined, comment nodes
    and ad/navigation elements are removed, the comment list is trimmed and
    simplified, and images are wrapped in left-floating divs.

    Args:
        url: Address passed straight to ``pq``.

    Returns:
        The inner HTML of ``.article`` concatenated with the inner HTML of
        ``#comment``, with every run of consecutive ``<br/>`` tags collapsed
        to a single ``<br/>``.

    NOTE(review): the code assumes ``#comment_list ul``, ``.article`` and
    ``#comment`` all exist; a missing one presumably raises ``IndexError``
    or ``TypeError`` -- confirm against the target pages.
    """

    def _inline_tweet_images(index, anchor):
        # Load the page the link points at and copy each non-avatar image
        # of the tweet container in front of the link.
        tweet_page = pq(anchor.attrib['href'])
        tweet_images = tweet_page(
            '.permalink-tweet-container img:not(.avatar)'
        )

        def _insert_before_anchor(image_index, image):
            return doc(anchor).before(
                doc('<img>').attr('src', image.attrib['src'])
            )

        return tweet_images.map(_insert_before_anchor)

    def _second_child_text(index, definition_list):
        # Each comment's <dl> is replaced by the text of its second child.
        return definition_list[1].text

    doc = pq(url)
    doc('a:contains("pic.twitter.com")').map(_inline_tweet_images)

    # Nodes whose ``tag`` is callable are the ones the original author
    # labelled "comment" (lxml gives comment nodes a callable tag).
    for node in doc.root.iter():
        if callable(node.tag):
            doc(node).remove()

    # Ads, social widgets, related-content boxes and other page chrome,
    # removed in the same order as the original statements.
    unwanted_selectors = (
        'script, .footer_social_ad, .button_top, .article_footer',
        '.ad_amazon, .jin-ads, #other_news_website',
        '#popular_articles_comment, #hot_tweet, #category-link',
        '.related-articles, #ad2, .ent_ad_md, #ad_rs, #tags',
        '.tooltip, .comment_form, .article_header',
    )
    for selector in unwanted_selectors:
        doc(selector).remove()

    # Drop the last two tables that are direct children of .article_bodymore.
    trailing_tables = doc('.article_bodymore > table')[-2:]
    doc(trailing_tables).remove()

    # Keep only the first 30 comment items.
    surplus_comments = doc('#comment_list li')[30:]
    doc(surplus_comments).remove()

    doc('#comment_list li dl').replace_with(_second_child_text)

    # Turn the first comment <ul> into an <ol>.
    doc('#comment_list ul')[0].tag = 'ol'

    doc('img').wrap('<div style="float: left !important">')

    # ``strip`` is defined elsewhere in the project and is applied to every
    # node -- presumably trims whitespace; verify against its definition.
    for node in doc.root.iter():
        strip(node)

    article_html = doc('.article').html()
    comment_html = doc('#comment').html()
    return re.sub(r'(<br/>)+', '<br/>', article_html + comment_html)