def get_summary(self, max_num=100):
    """
    @attention: Build a summary from this object's content.

    Hands ``self.content`` to ``common.utils.get_summary_from_html_by_sub``
    and returns whatever that helper produces.

    @param max_num: forwarded unchanged as the helper's ``max_num`` keyword
        (presumably the maximum summary length -- confirm against the helper)
    """
    # Imported at call time, as in the original, rather than at module level.
    from common.utils import get_summary_from_html_by_sub

    source_text = self.content
    return get_summary_from_html_by_sub(source_text, max_num=max_num)
def auto_publish_article(key):
    """
    @attention: Crawl Sogou WeChat search results for ``key`` and save the
        articles that pass the filters below.

    Requests result pages 0-19 from ``weixin.sogou.com``, downloads every
    article linked from ``.wx-rb3 .txt-box a`` and extracts its text with
    ``get_summary_from_html_by_sub``.  An article is kept only when

    * its text, with all whitespace removed, is longer than 300 and shorter
      than 3000 characters, and
    * ``is_in_baidu`` returns a falsy value for each of three sampled
      10-character snippets (presumably "not already indexed by Baidu" --
      confirm against ``is_in_baidu``).

    Every kept article is written to ``./txt/<title>.txt`` and the kept URLs
    are written, one per line, to ``./txt/hrefs.txt``.  A running counter and
    the final total are printed to stdout.

    NOTE: this is Python 2 code -- files are opened in text mode and fed
    UTF-8 encoded byte strings.

    @param key: search keyword; concatenated into the query URL as-is
    """
    from common.utils import get_summary_from_html_by_sub

    # Compiled once up front; the pattern never changes between articles.
    re_blank = re.compile(r'[\s]+')
    # Offsets of the snippets that get looked up through is_in_baidu.
    sample_slices = (slice(50, 60), slice(150, 160), slice(250, 260))

    count = 0
    hrefs = []
    for index in range(20):
        url = ('http://weixin.sogou.com/weixin?query=' + key +
               '&type=2&ie=utf8&page=' + str(index) +
               '&p=40040100&dp=1&w=01019900&dr=1')
        jq = pq(requests.get(url).text)

        for article in jq('.wx-rb3 .txt-box a'):
            print(count)
            href = article.get("href")
            name = article.text_content()

            text = get_summary_from_html_by_sub(
                requests.get(href).text, max_num=990000, filter_nbsp=True)

            key_text = re_blank.sub('', text)
            if not 300 < len(key_text) < 3000:
                continue
            # any() stops at the first hit, so lookups happen in the same
            # order, and no more often, than three separate checks would.
            if any(is_in_baidu(key_text[part]) for part in sample_slices):
                continue

            count += 1
            hrefs.append(href)

            # The title comes from scraped HTML; a path separator in it would
            # point outside ./txt (or at a missing directory) and abort the
            # whole crawl, so neutralise separators before building the path.
            safe_name = name.replace(u"/", u"_").replace(u"\\", u"_")
            with open((u"./txt/%s.txt" % safe_name).encode("utf8"), "w") as out:
                out.write(text.encode("utf8"))

    with open((u"./txt/hrefs.txt").encode("utf8"), "w") as out:
        out.write(u"".join(u"%s\n" % href for href in hrefs).encode("utf8"))
    print(u"total articles:%s" % count)
def auto_publish_article(key):
    """
    @attention: Crawl Sogou WeChat search results for ``key`` and save the
        articles that pass the filters below.

    Requests result pages 0-19 from ``weixin.sogou.com``, downloads every
    article linked from ``.wx-rb3 .txt-box a`` and extracts its text with
    ``get_summary_from_html_by_sub``.  An article is kept only when

    * its text, with all whitespace removed, is longer than 300 and shorter
      than 3000 characters, and
    * ``is_in_baidu`` returns a falsy value for each of three sampled
      10-character snippets (presumably "not already indexed by Baidu" --
      confirm against ``is_in_baidu``).

    Every kept article is written to ``./txt/<title>.txt`` and the kept URLs
    are written, one per line, to ``./txt/hrefs.txt``.  A running counter and
    the final total are printed to stdout.

    NOTE: this is Python 2 code -- files are opened in text mode and fed
    UTF-8 encoded byte strings.

    @param key: search keyword; concatenated into the query URL as-is
    """
    from common.utils import get_summary_from_html_by_sub

    # Compiled once up front; the pattern never changes between articles.
    re_blank = re.compile(r'[\s]+')
    # Offsets of the snippets that get looked up through is_in_baidu.
    sample_slices = (slice(50, 60), slice(150, 160), slice(250, 260))

    count = 0
    hrefs = []
    for index in range(20):
        url = ('http://weixin.sogou.com/weixin?query=' + key +
               '&type=2&ie=utf8&page=' + str(index) +
               '&p=40040100&dp=1&w=01019900&dr=1')
        jq = pq(requests.get(url).text)

        for article in jq('.wx-rb3 .txt-box a'):
            print(count)
            href = article.get("href")
            name = article.text_content()

            text = get_summary_from_html_by_sub(
                requests.get(href).text, max_num=990000, filter_nbsp=True)

            key_text = re_blank.sub('', text)
            if not 300 < len(key_text) < 3000:
                continue
            # any() stops at the first hit, so lookups happen in the same
            # order, and no more often, than three separate checks would.
            if any(is_in_baidu(key_text[part]) for part in sample_slices):
                continue

            count += 1
            hrefs.append(href)

            # The title comes from scraped HTML; a path separator in it would
            # point outside ./txt (or at a missing directory) and abort the
            # whole crawl, so neutralise separators before building the path.
            safe_name = name.replace(u"/", u"_").replace(u"\\", u"_")
            with open((u"./txt/%s.txt" % safe_name).encode("utf8"), "w") as out:
                out.write(text.encode("utf8"))

    with open((u"./txt/hrefs.txt").encode("utf8"), "w") as out:
        out.write(u"".join(u"%s\n" % href for href in hrefs).encode("utf8"))
    print(u"total articles:%s" % count)
def get_summary(self):
    """
    @attention: Build a summary from this object's content.

    Passes ``self.content`` to ``common.utils.get_summary_from_html_by_sub``
    (using that helper's default options) and returns its result.
    """
    # Imported at call time, as in the original, rather than at module level.
    from common.utils import get_summary_from_html_by_sub

    source_text = self.content
    return get_summary_from_html_by_sub(source_text)
def get_summary(self):
    """
    @note: Build a summary from this object's ``des`` attribute.

    Passes ``self.des`` (presumably the description text -- confirm against
    the owning class) to ``common.utils.get_summary_from_html_by_sub`` and
    returns its result.
    """
    # Imported at call time, as in the original, rather than at module level.
    from common.utils import get_summary_from_html_by_sub

    source_text = self.des
    return get_summary_from_html_by_sub(source_text)
def get_summary(self):
    """
    @attention: Build a summary from this object's ``summary`` attribute.

    Passes ``self.summary`` to ``common.utils.get_summary_from_html_by_sub``
    (using that helper's default options) and returns its result.
    """
    # Imported at call time, as in the original, rather than at module level.
    from common.utils import get_summary_from_html_by_sub

    source_text = self.summary
    return get_summary_from_html_by_sub(source_text)