def news_analyze(cls, query_body):
    """
    Analyze a single news article: fetch the article content by its exact URL,
    first from MySQL and, failing that, by crawling it from the web.
    :param query_body: dict with "news_url" and "news_src" keys
    :return:
    """
    news_content = ""
    news_url = query_body["news_url"]
    news_src = query_body["news_src"]
    # map the source name carried in the request to its MySQL database.table
    src_dict = {
        "新浪": "sina_news.sina_mid",
        "网易": "wangyi_news.wangyi_mid",
        "腾讯": "qq_news.qq_mid",
        "搜狐": "souhu_news.souhu_mid",
        "新华": "xinhua_news.xinhua_mid"
    }
    try:
        sql_statement = "select * from %s where news_link='%s';" % (
            src_dict[news_src], news_url)
        conn = MysqlHelper.create_conn()
        cur = conn.cursor()
        cur.execute(sql_statement)
        mysql_res = cur.fetchall()
        if len(mysql_res) > 0:
            news_content = mysql_res[0][3]
        else:
            # not in the database: crawl it from the web
            # (fetched via the Xinhua session)
            news_content = Html2Article.url2article(news_url)
    except Exception:
        # lookup failed (unknown source, DB error, ...): fall back to crawling
        news_content = Html2Article.url2article(news_url)
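# The lookup above interpolates both the table name and the URL straight into the
# SQL string. A hedged alternative, shown only as a sketch: the table name still
# comes from the fixed src_dict whitelist (identifiers cannot be bound as
# parameters), but the URL is passed to the driver as a bound %s parameter, so
# quotes or other odd characters in a link cannot break the statement.
# fetch_content_by_url is a hypothetical helper name, and it assumes that
# MysqlHelper.create_conn() returns a pymysql-style connection whose cursor
# supports parameterized execute().
from model.db_operate.mysql_helper import MysqlHelper


def fetch_content_by_url(table, news_url):
    conn = MysqlHelper.create_conn()
    cur = conn.cursor()
    # the driver escapes and substitutes the %s placeholder itself
    cur.execute("select * from " + table + " where news_link=%s", (news_url,))
    rows = cur.fetchall()
    # column 3 holds the article body, matching mysql_res[0][3] above
    return rows[0][3] if rows else ""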
# -*- coding:utf-8 -*-
import sys

import requests
import pymysql
from elasticsearch import Elasticsearch

from model.db_operate.mysql_helper import MysqlHelper

reload(sys)
sys.setdefaultencoding('utf-8')

# full import of the news tables from MySQL into Elasticsearch
conn = MysqlHelper.create_conn()
cur = conn.cursor()
es = Elasticsearch()
db_table_list = [
    "qq_news.qq_mid",
    "sina_news.sina_mid",
    "souhu_news.souhu_mid",
    "wangyi_news.wangyi_mid",
    "xinhua_news.xinhua_mid"
]

for db_table in db_table_list:
    sql_statement = "select * from %s;" % db_table
    cur.execute(sql_statement)
    res = cur.fetchall()
    for news in res:
        try:
            # row layout: [1] title, [2] publish time, [3] body, [4] source, [5] link
            title = news[1]
            pub_time = str(news[2]).split()[0]  # keep only the date part
            news_content = news[3]
            src = news[4]
            url = news[5]
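# A hedged sketch of how each unpacked row could be written into Elasticsearch.
# The index name "news", the doc_type "doc", and the document field names are
# assumptions, not taken from the project; doc_type is the pre-7.x
# elasticsearch-py parameter, which matches this Python 2 code base. Using the
# article URL as the document id keeps repeated imports idempotent.
def index_news(es_client, title, pub_time, news_content, src, url):
    doc = {
        "title": title,
        "pub_time": pub_time,
        "content": news_content,
        "src": src,
        "url": url
    }
    es_client.index(index="news", doc_type="doc", id=url, body=doc)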