예제 #1
0
def search_news(keyword=None, num=5):
    """
    Search Baidu news for the given keywords and return the parsed results.

    : keyword: iterable of keyword strings; they are joined with "+" to form
               the query (a plain string would be joined character by
               character, so pass a list)
    : num: currently unused by this function; kept so existing callers
           keep working
    : return: a list of dicts, one per news item that could be parsed (an
              empty list when no keyword is given). Each dict contains:
        : title: news title
        : url : news url
        : source: news source like "sina"
        : images : image urls found inside the result item
        : content: news content
        : time:  news publish time
    """
    if not keyword:
        return []
    logger.debug("baidu keywords=%s", keyword)

    # Configure the module-level browser object `br` (defined outside this
    # function; presumably a mechanize.Browser -- confirm).
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # Build the query URL: keywords joined with "+", UTF-8 encoded and quoted.
    words = '+'.join(keyword)
    search_words = urllib.quote(str2utf8(words)[0])
    url = "%sword=%s" % (BAIDU_BASE_URL["news"], search_words)
    logger.debug("url=%s", url)
    result = br.open(url).read()

    soup = BeautifulSoup(result)
    news_list = []
    for c in soup.findAll("li"):
        try:
            news = {}
            news["title"] = c.a.text.strip()
            news["url"] = c.a.get("href")
            news["images"] = [img.get("src") for img in c.findAll("img")]

            # The <span> text is split below as "<source> <day> <time>";
            # non-breaking spaces are normalised first so the split works.
            # NOTE(review): the original also had a replace(" ", " ") here,
            # which is a no-op as written -- possibly a mangled "&nbsp;";
            # confirm against the upstream source.
            meta = str2utf8(c.span.text)[0].decode("utf-8")
            meta = meta.replace(u"\xa0", u" ").strip()
            logger.debug("temp=%s", meta)

            parts = meta.rsplit(' ', 2)
            if len(parts) != 3:
                # Not in the expected three-field shape; skip this item.
                continue

            news["source"], day, clock = parts
            news["time"] = day + " " + clock
            news["content"] = c.find("div", {"class": "c-summary"}).text\
                .replace(u"- 百度快照", "").strip()
            news_list.append(news)
        except AttributeError as e:
            # A missing <a>, <span> or summary <div> means this <li> is not a
            # news entry; log it and move on to the next item.
            logger.warning(e)
        except Exception:
            # Best effort: one malformed item must not abort the whole search.
            logger.error(traceback.format_exc())
    return news_list
예제 #2
0
def get_msg(keyword=None, name=None):
    """
    Grab messages and apply some filtering before they are sent.

    Filtering rule (translated from the original notes): an item is returned
    if the news or weibo content is not yet in the database, or if it is in
    the database and was published within MAX_LONG_TIME; everything else is
    filtered out.

    : keyword: key word list for searching news
    : name: weibo account (the weibo lookup below is currently disabled)
    : return: a dict whose "news" entry holds the result of can_send_news;
              an empty dict when neither keyword nor name is given, or when
              any exception is raised along the way
    """
    try:
        if not keyword and not name:
            return {}
        send_contents = {}
        # search_news returns None when it has nothing to report (e.g. for
        # an empty keyword); normalise so the filter always receives a list.
        news_list = search_news(keyword) or []

        send_contents["news"] = can_send_news(keyword, news_list)
        logger.debug("can send %d news after filtered", len(send_contents["news"]))
        # TODO: weibo lookup is disabled; kept for reference.
        # if name:
        #     weibo = search_weibo(name)
        #     ret = can_send_weibo(name, weibo)
        #     if ret:
        #         send_contents["weibo"] = ret
        return send_contents
    except Exception:
        # Top-level boundary: log the full traceback and return an empty
        # result rather than propagating the error to the caller.
        logger.error(traceback.format_exc())
        return {}