示例#1
0
def get_entry(href):
    """Load one article page and return its fields as a dict.

    ``href`` is appended to the site root to form the article link.  The
    returned dict has the keys ``link``, ``title``, ``author``, ``content``
    and ``published``.

    Raises ValueError when the paragraph following the first ``<h1>`` does
    not split into exactly two parts around the separator.
    """
    link = "http://www.playno1.com/" + href
    page = pq(link, redirect=False)

    heading = page("h1:first")
    # The line after the heading reads "<published> | <label>:<author>";
    # the escaped label is the two characters U+4F5C U+8005.
    byline = page("h1:first ~ p").text()
    published, author = byline.split(u" | \u4f5c\u8005:")

    return {
        "link": link,
        "title": heading.text(),
        "author": author,
        "content": page("#article_content").html(),
        "published": published,
    }
示例#2
0
def get_content(url):
    """Load the page at ``url``, prune it in place and return cleaned HTML.

    The return value is the HTML of ``.article`` followed by the HTML of
    ``#comment``, with runs of consecutive ``<br/>`` collapsed to one.

    NOTE(review): ``pq`` is presumably pyquery's ``PyQuery`` and ``strip`` is
    a helper defined outside this block -- neither is visible here; confirm.
    """
    q = pq(url)
    # For every link whose text contains "pic.twitter.com": load the link's
    # href as its own document, pick the non-avatar images inside
    # '.permalink-tweet-container', and insert an <img> with the same src
    # in front of the original link.
    q('a:contains("pic.twitter.com")').map(
        lambda i, x: pq(x.attrib['href'])(
            '.permalink-tweet-container img:not(.avatar)'
        ).map(lambda j, y: q(x).before(q('<img>').attr('src', y.attrib['src'])))
    )
    # Remove every node whose ``tag`` is callable -- presumably lxml comment /
    # processing-instruction nodes (TODO confirm).
    [q(i).remove() for i in q.root.iter() if callable(i.tag)] # comment
    # Drop scripts and, judging by the selector names, ads, social widgets,
    # related-article lists and other page chrome.
    q('script, .footer_social_ad, .button_top, .article_footer').remove()
    q('.ad_amazon, .jin-ads, #other_news_website').remove()
    q('#popular_articles_comment, #hot_tweet, #category-link').remove()
    q('.related-articles, #ad2, .ent_ad_md, #ad_rs, #tags').remove()
    q('.tooltip, .comment_form, .article_header').remove()
    # Remove the last two tables that are direct children of .article_bodymore.
    q(q('.article_bodymore > table')[-2:]).remove()
    # Keep only the first 30 comment list items.
    q(q('#comment_list li')[30:]).remove()
    # Replace each comment's <dl> with the text of its second child element.
    q('#comment_list li dl').replace_with(lambda i, x: x[1].text)
    # Turn the first comment <ul> into an <ol>; the [0] index raises
    # IndexError when no such <ul> exists.
    q('#comment_list ul')[0].tag = 'ol'
    # Wrap every image in a left-floated <div>.
    q('img').wrap('<div style="float: left !important">')
    # Run strip() on every node of the tree (helper defined elsewhere).
    [strip(i) for i in q.root.iter()]
    # NOTE(review): if either selection is empty, .html() presumably returns
    # None and this concatenation fails -- confirm.
    content = q('.article').html() + q('#comment').html()
    # Collapse consecutive <br/> tags into a single one.
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content
示例#3
0
文件: ptt_feed.py 项目: ypcat/cowper
def get_entry(url, prefix='', title=''):
    """Load one article page and return its feed-entry fields.

    Args:
        url: Address of the article page; returned unchanged as ``link``.
        prefix: Text placed in front of the entry title.
        title: Fallback title, used when the page yields no title text.

    Returns:
        A dict with the keys ``link``, ``title``, ``author``, ``content``,
        ``published`` and ``updated``.  ``updated`` falls back to
        ``published`` when ``push_time`` returns a falsy value.

    NOTE(review): ``post_time``, ``push_time`` and ``get_content`` are defined
    outside this block; they presumably parse the timestamps and clean the
    article HTML -- confirm against their definitions.
    """
    q = pq(url, cookies=COOKIES)
    author = q('.article-meta-value:eq(0)').text()
    # The parentheses matter: ``prefix + text or title`` is parsed as
    # ``(prefix + text) or title``, so any non-empty prefix made the left side
    # truthy and the ``title`` fallback was never used.
    title = prefix + (q('.article-meta-value:eq(2)').text() or title)
    published = post_time(q('.article-meta-value:eq(3)').text())
    updated = push_time(q('.push-ipdatetime:last').text()) or published
    content = get_content(q('#main-content').html())
    return {
        'link': url,
        'title': title,
        'author': author,
        'content': content,
        'published': published,
        'updated': updated,
    }
示例#4
0
文件: ptt_feed.py 项目: ypcat/cowper
def get_page(url):
    """Load a listing page and return its next-page URL and selected posts.

    Returns a dict with two keys:
      * ``next_url`` -- ``BASE_URL`` plus the href of the second ``a.wide``.
      * ``posts`` -- a list of dicts with ``link``, ``prefix`` and ``title``.

    Every ``.r-ent`` row that follows ``.r-list-sep`` is removed before the
    rows are scanned.
    """
    doc = pq(url, cookies=COOKIES)
    doc('.r-list-sep ~ .r-ent').remove()
    next_url = BASE_URL + doc('a.wide:eq(1)').attr('href')

    posts = []
    for row in doc('.r-ent'):
        # Skip rows without an .f0/.f1/.f3 marker, and rows whose .f3 text
        # starts with '1' (take out push < 20).
        if not doc('.f0,.f1,.f3', row) or doc('.f3', row).text().startswith('1'):
            continue
        anchor = doc('a', row)
        if not anchor:
            # No link in this row, so there is nothing to point at.
            continue
        posts.append({
            'link': BASE_URL + anchor.attr('href'),
            'prefix': doc('.nrec', row).text() + ' ',
            'title': doc('.title', row).text(),
        })

    return {'next_url': next_url, 'posts': posts}
示例#5
0
def get_entries(url):
    """Yield one entry per ``.fire_float`` element on the page at ``url``.

    For each element the href of its ``a`` descendant is passed to
    ``get_entry``; the generator then sleeps three seconds before moving on
    to the next element.
    """
    listing = pq(url, redirect=False)
    for node in listing(".fire_float"):
        yield get_entry(listing("a", node).attr("href"))
        # Pause between article fetches.
        time.sleep(3)