def filter_data(news_df):
    """Filter the news data."""
    df = preprocessing.data_filter(news_df)
    now_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    # now_time = '2018-04-06 23:59'
    # keep only news published within `delta` of now_time
    df = preprocessing.get_data(df, last_time=now_time, delta=5)
    return df
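# Hypothetical usage sketch, not part of the original module: filter_data can
# be applied to a previously saved news_df.csv. news_pandas, news_path and the
# file name are taken from the surrounding code; the flow itself is an assumption.
saved_df = news_pandas.load_news(os.path.join(news_path, 'news_df.csv'))
recent_df = filter_data(saved_df)
print('Recent valid news items:', recent_df.shape[0])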
def crawler():
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    # Empty input counts as 0; non-positive values are clamped to 0.
    sina_top_n = 0 if sina_top_n == '' else int(sina_top_n)
    sohu_top_n = 0 if sohu_top_n == '' else int(sohu_top_n)
    xinhuanet_top_n = 0 if xinhuanet_top_n == '' else int(xinhuanet_top_n)
    sina_top_n = 0 if sina_top_n <= 0 else sina_top_n
    sohu_top_n = 0 if sohu_top_n <= 0 else sohu_top_n
    xinhuanet_top_n = 0 if xinhuanet_top_n <= 0 else xinhuanet_top_n
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        # "The news counts cannot all be non-positive!"
        messagebox.showinfo('Message', '新闻数量不能全部为非正数!')
        return
    news_df_file_path = os.path.join(news_path, 'news_df.csv')
    threaded_crawler(sina_top_n, sohu_top_n, xinhuanet_top_n, save_file_path=news_df_file_path)
    news_df = load_news(news_df_file_path)
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    # "Finished crawling the latest news! {} valid items in total!"
    messagebox.showinfo('Message', '爬取即时新闻完成!共{}条有效新闻!'.format(news_num))
def select_news():
    filename = filedialog.askopenfilename(filetypes=[("csv file", "*.csv")])
    if len(filename) == 0:
        return
    news_df = news_pandas.load_news(filename)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
def crawler():
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    # Empty input counts as 0; non-positive values are clamped to 0.
    sina_top_n = 0 if sina_top_n == '' else int(sina_top_n)
    sohu_top_n = 0 if sohu_top_n == '' else int(sohu_top_n)
    xinhuanet_top_n = 0 if xinhuanet_top_n == '' else int(xinhuanet_top_n)
    sina_top_n = 0 if sina_top_n <= 0 else sina_top_n
    sohu_top_n = 0 if sohu_top_n <= 0 else sohu_top_n
    xinhuanet_top_n = 0 if xinhuanet_top_n <= 0 else xinhuanet_top_n
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        # "The news counts cannot all be non-positive!"
        messagebox.showinfo('Message', '新闻数量不能全部为非正数!')
        return
    news_crawler.threaded_crawler(sina_top_n, sohu_top_n, xinhuanet_top_n)
    sina_news_df = pd.DataFrame()
    sohu_news_df = pd.DataFrame()
    xinhuanet_news_df = pd.DataFrame()
    if sina_top_n > 0:
        sina_news_df = news_pandas.load_news(os.path.join(news_path, 'sina_latest_news.csv'))
    if sohu_top_n > 0:
        sohu_news_df = news_pandas.load_news(os.path.join(news_path, 'sohu_latest_news.csv'))
    if xinhuanet_top_n > 0:
        xinhuanet_news_df = news_pandas.load_news(os.path.join(news_path, 'xinhuanet_latest_news.csv'))
    news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    # "Finished crawling the latest news! {} valid items in total!"
    messagebox.showinfo('Message', '爬取即时新闻完成!共{}条有效新闻!'.format(news_num))
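# Hypothetical GUI wiring sketch, not in the original source: the callbacks above
# reference module-level names (Entry_Sina, Entry_Sohu, Entry_XinhuaNet, sum_top_n,
# news_path, temp_news_path, filter_df) that are defined elsewhere in the project.
# One plausible Tkinter setup, under those assumptions, looks like this.
import os
import tkinter as tk
from tkinter import messagebox, filedialog

news_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'news')
temp_news_path = os.path.join(news_path, 'temp_news')  # assumed location for intermediate files
os.makedirs(news_path, exist_ok=True)
os.makedirs(temp_news_path, exist_ok=True)
filter_df = None

root = tk.Tk()
root.title('News Analysis')

Entry_Sina = tk.Entry(root)
Entry_Sohu = tk.Entry(root)
Entry_XinhuaNet = tk.Entry(root)
for entry in (Entry_Sina, Entry_Sohu, Entry_XinhuaNet):
    entry.pack()

sum_top_n = tk.IntVar(value=0)  # displays the number of valid news items
tk.Label(root, textvariable=sum_top_n).pack()

tk.Button(root, text='爬取即时新闻', command=crawler).pack()     # "crawl latest news"
tk.Button(root, text='选择本地新闻', command=select_news).pack()  # "select local news file"

root.mainloop()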
# -*- coding: utf-8 -*-
import os
import pandas as pd
from datetime import datetime
from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root path
news_path = os.path.join(project_path, 'news')  # directory where news data is stored
if not os.path.exists(news_path):  # create the news folder if it does not exist
    os.mkdir(news_path)

sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
sohu_news_df = news_crawler.get_latest_news('sohu', top=10, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=10, show_content=True)
news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
news_df = preprocessing.data_filter(news_df)
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)
news_df['content'] = news_df['content'].map(lambda x: preprocessing.clean_content(x))
news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))
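# Hypothetical follow-up, not in the original script: reload the cleaned file with
# plain pandas to verify what was saved; this assumes news_crawler.save_news writes
# a standard CSV.
check_df = pd.read_csv(os.path.join(news_path, 'latest_news.csv'))
print(check_df.shape)
print(check_df.columns.tolist())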