    @classmethod
    def db_prepare(cls):
        """
        Create the database and table for Sina news; for now only 2018 is considered.
        :return:
        """
        MysqlHelper.create_database('sina_news')
        MysqlHelper.create_news_table_mid('sina_news', 'sina_mid')
    @classmethod
    def save_one_news(cls, url, db_name, table_name):
        """
        Fetch a single NetEase article by URL and store it in MySQL.
        """
        news_info = WangyiNewsExtractor.news_info_by_url(url)
        if news_info:
            MysqlHelper.insert_news_mid(db_name=db_name,
                                        table_name=table_name,
                                        news_title=news_info["title"],
                                        pub_time=news_info["time"],
                                        content=news_info["content"],
                                        news_src="网易",
                                        news_link=news_info["url"],
                                        category=news_info["category"],
                                        topic='',
                                        summary='',
                                        polarity=news_info["polarity"])
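# A minimal usage sketch for save_one_news. The enclosing class name
# (WangyiNewsSaver) and the article URL are assumptions for illustration;
# the db/table names match the ones created by the setup script below.
if __name__ == "__main__":
    WangyiNewsSaver.save_one_news(   # hypothetical class holding save_one_news
        url='http://news.163.com/some_article.html',   # placeholder URL
        db_name='wangyi_news',
        table_name='wangyi_mid')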
    @classmethod
    def save_old_news_day(cls, date):
        """
        Fetch all Sina news briefs for a given day, extract full content,
        keywords and sentiment polarity, and store each article in MySQL.
        """
        news_list = SinaNewsBrief.fetch_news_by_day(date)
        print 'news_list_get'
        count = 0
        for news_block in news_list:
            try:
                news_url = news_block['url']
                # full article text extracted from the page
                news_content = Html2Article.url2article(news_url)
                page_content = requests.get(news_url).content
                news_keyword = SinaNewsExtract.extract_keywords(page_content)
                # sentiment polarity is scored on the headline only
                polarity = SentiCalc.score_calc(news_block["title"])
                MysqlHelper.insert_news_mid('sina_news', 'sina_mid',
                                            news_block['title'],
                                            news_block['time'],
                                            news_content,
                                            '新浪',
                                            news_block['url'],
                                            news_block['category'],
                                            news_keyword,
                                            '',
                                            polarity=str(polarity))
                print count, polarity, news_block['title']
                count += 1
            except Exception, e:
                print e
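# Backfill sketch: drive save_old_news_day over a date range. The
# 'YYYY-MM-DD' date format and the SinaNewsSaver class name are
# assumptions; adjust to whatever fetch_news_by_day actually expects.
import datetime

def backfill_sina(start, end):
    day = start
    while day <= end:
        SinaNewsSaver.save_old_news_day(day.strftime('%Y-%m-%d'))
        day += datetime.timedelta(days=1)

backfill_sina(datetime.date(2018, 1, 1), datetime.date(2018, 1, 7))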
    @classmethod
    def news_analyze(cls, query_body):
        """
        Analyze a single news article: look up its content by exact URL,
        first in MySQL, then falling back to crawling it from the web.
        :param query_body:
        :return:
        """
        news_content = ""
        news_url = query_body["news_url"]
        news_src = query_body["news_src"]
        # maps a source name to its "database.table" location
        src_dict = {
            "新浪": "sina_news.sina_mid",
            "网易": "wangyi_news.wangyi_mid",
            "腾讯": "qq_news.qq_mid",
            "搜狐": "souhu_news.souhu_mid",
            "新华": "xinhua_news.xinhua_mid"
        }
        try:
            # the table name comes from the trusted src_dict; the URL is
            # passed as a query parameter to avoid SQL injection
            sql_statement = "select * from %s where news_link=%%s;" % src_dict[news_src]
            conn = MysqlHelper.create_conn()
            cur = conn.cursor()
            cur.execute(sql_statement, (news_url,))
            mysql_res = cur.fetchall()
            if len(mysql_res) > 0:
                news_content = mysql_res[0][3]
            else:
                # not found in the database: crawl from the web,
                # using the Xinhua site's session
                news_content = Html2Article.url2article(news_url)
        except Exception, e:
            # database lookup failed: crawl from the web,
            # using the Xinhua site's session
            news_content = Html2Article.url2article(news_url)
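# Example call, grounded in the keys news_analyze reads from query_body.
# The URL is a placeholder and NewsAnalyzer is an assumed class name.
query_body = {
    "news_url": "http://news.sina.com.cn/some_article.shtml",  # placeholder
    "news_src": "新浪"   # must be one of the keys in src_dict
}
NewsAnalyzer.news_analyze(query_body)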
# -*- coding:utf-8 -*-
import sys
import requests
import pymysql
from elasticsearch import Elasticsearch
from model.db_operate.mysql_helper import MysqlHelper

reload(sys)
sys.setdefaultencoding('utf-8')

# full import from the MySQL databases into Elasticsearch
conn = MysqlHelper.create_conn()
cur = conn.cursor()
es = Elasticsearch()
db_table_list = [
    "qq_news.qq_mid",
    "sina_news.sina_mid",
    "souhu_news.souhu_mid",
    "wangyi_news.wangyi_mid",
    "xinhua_news.xinhua_mid"
]
for db_table in db_table_list:
    sql_statement = "select * from %s;" % db_table
    cur.execute(sql_statement)
    res = cur.fetchall()
    for news in res:
        try:
            title = news[1]
            pub_time = str(news[2]).split()[0]   # keep only the date part
            news_content = news[3]
            src = news[4]
            url = news[5]
            # index the document; the original indexing call was truncated,
            # so the index/doc_type names here are assumptions
            es.index(index='news', doc_type='doc', body={
                'title': title,
                'pub_time': pub_time,
                'content': news_content,
                'src': src,
                'url': url
            })
        except Exception, e:
            print e
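# Quick verification sketch appended to the same script (reuses the es
# client above): count the indexed documents and fetch one back. The
# index name 'news' is the same assumption as in the import loop.
print es.count(index='news')['count']
sample = es.search(index='news', body={"query": {"match_all": {}}, "size": 1})
print sample['hits']['total']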
# -*- coding:utf-8 -*-
import sys
from model.db_operate.mysql_helper import MysqlHelper
import pymysql

reload(sys)
sys.setdefaultencoding('utf-8')

# create db and tables
if __name__ == "__main__":
    MysqlHelper.create_database('souhu_news')
    MysqlHelper.create_news_table_mid('souhu_news', 'souhu_mid')
    MysqlHelper.create_database('qq_news')
    MysqlHelper.create_news_table_mid('qq_news', 'qq_mid')
    MysqlHelper.create_database('wangyi_news')
    MysqlHelper.create_news_table_mid('wangyi_news', 'wangyi_mid')
    MysqlHelper.create_database('xinhua_news')
    MysqlHelper.create_news_table_mid('xinhua_news', 'xinhua_mid')
    MysqlHelper.create_database('sina_news')
    MysqlHelper.create_news_table_mid('sina_news', 'sina_mid')
    print "db and table created!"
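# A hedged sketch of the DDL that create_news_table_mid presumably emits.
# The column order is inferred from insert_news_mid's arguments and from
# the news[1]..news[5] tuple indexing used elsewhere in this repo; the
# exact names and types are assumptions, not the real MysqlHelper DDL.
CREATE_MID_TABLE_SKETCH = """
CREATE TABLE IF NOT EXISTS %s (
    id INT PRIMARY KEY AUTO_INCREMENT,
    news_title VARCHAR(255),      -- news[1]
    pub_time DATETIME,            -- news[2]
    content TEXT,                 -- news[3]
    news_src VARCHAR(32),         -- news[4]
    news_link VARCHAR(512),       -- news[5]
    category VARCHAR(64),
    topic TEXT,                   -- keywords extracted from the page
    summary TEXT,
    polarity VARCHAR(16)          -- sentiment score stored as a string
);
"""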