Example #1
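Two handlers from a Tkinter news-crawler GUI: filter_data() trims the crawled news to the few minutes before a reference time, and crawler() reads the per-site article counts from the entry fields, runs the threaded crawler, filters the result, and reports how many valid articles were kept.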
def filter_data(news_df):
    """过滤数据"""
    df = preprocessing.data_filter(news_df)
    now_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    # now_time = '2018-04-06 23:59'
    df = preprocessing.get_data(df, last_time=now_time, delta=5)
    return df
def crawler():
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    # Treat empty input as 0 and clamp negative counts to 0
    sina_top_n = max(0, int(sina_top_n or 0))
    sohu_top_n = max(0, int(sohu_top_n or 0))
    xinhuanet_top_n = max(0, int(xinhuanet_top_n or 0))
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        messagebox.showinfo('Message', 'The news counts cannot all be non-positive!')
        return
    news_df_file_path = os.path.join(news_path, 'news_df.csv')
    threaded_crawler(sina_top_n,
                     sohu_top_n,
                     xinhuanet_top_n,
                     save_file_path=news_df_file_path)
    news_df = load_news(news_df_file_path)
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df,
                          os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    messagebox.showinfo('Message', 'Finished crawling the latest news! {} valid articles in total!'.format(news_num))
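The handlers above reference Tkinter widgets defined elsewhere in the module (Entry_Sina, Entry_Sohu, Entry_XinhuaNet, sum_top_n; those names come from the code, everything else here is assumed). A minimal wiring sketch could look like this:

import tkinter as tk
from tkinter import messagebox  # used by crawler() above

root = tk.Tk()
# Entry fields for the per-site article counts (names assumed from the handlers above)
Entry_Sina = tk.Entry(root)
Entry_Sohu = tk.Entry(root)
Entry_XinhuaNet = tk.Entry(root)
for entry in (Entry_Sina, Entry_Sohu, Entry_XinhuaNet):
    entry.pack()
# Displays the number of valid articles after a crawl
sum_top_n = tk.IntVar(master=root)
tk.Label(root, textvariable=sum_top_n).pack()
tk.Button(root, text='Crawl', command=crawler).pack()
root.mainloop()
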
Example #3
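Here select_news() imports an existing CSV through a file dialog instead of crawling, while this variant of crawler() calls news_crawler.threaded_crawler() and then rebuilds news_df by concatenating the per-site CSV files it wrote.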
def select_news():
    filename = filedialog.askopenfilename(filetypes=[("csv file", "*.csv")])
    if len(filename) == 0:
        return
    news_df = news_pandas.load_news(filename)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
def crawler():
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    # Treat empty input as 0 and clamp negative counts to 0
    sina_top_n = max(0, int(sina_top_n or 0))
    sohu_top_n = max(0, int(sohu_top_n or 0))
    xinhuanet_top_n = max(0, int(xinhuanet_top_n or 0))
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        messagebox.showinfo('Message', 'The news counts cannot all be non-positive!')
        return
    news_crawler.threaded_crawler(sina_top_n, sohu_top_n, xinhuanet_top_n)
    sina_news_df = pd.DataFrame()
    sohu_news_df = pd.DataFrame()
    xinhuanet_news_df = pd.DataFrame()
    if sina_top_n > 0:
        sina_news_df = news_pandas.load_news(
            os.path.join(news_path, 'sina_latest_news.csv'))
    if sohu_top_n > 0:
        sohu_news_df = news_pandas.load_news(
            os.path.join(news_path, 'sohu_latest_news.csv'))
    if xinhuanet_top_n > 0:
        xinhuanet_news_df = news_pandas.load_news(
            os.path.join(news_path, 'xinhuanet_latest_news.csv'))
    news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df],
                        ignore_index=True)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df,
                          os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    messagebox.showinfo('Message', 'Finished crawling the latest news! {} valid articles in total!'.format(news_num))
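news_pandas is a project-local helper module whose implementation is not shown here. A plausible stand-in, assuming it is a thin wrapper over pandas CSV I/O:

import os
import pandas as pd

def save_news(df, file_path):
    """Hypothetical stand-in for news_pandas.save_news."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # utf-8-sig is an assumption: the BOM keeps Chinese text readable in Excel
    df.to_csv(file_path, index=False, encoding='utf-8-sig')

def load_news(file_path):
    """Hypothetical stand-in for news_pandas.load_news."""
    return pd.read_csv(file_path, encoding='utf-8-sig')
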
Example #5
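A standalone script: it crawls the three news portals, saves each raw result, merges and filters the combined DataFrame, cleans the article bodies, and writes everything to latest_news.csv.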
# -*- coding: utf-8 -*-

import os
import pandas as pd
from datetime import datetime
from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root directory
news_path = os.path.join(project_path, 'news')  # directory for storing the news data
if not os.path.exists(news_path):  # create the news folder if it does not exist
    os.mkdir(news_path)

sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
sohu_news_df = news_crawler.get_latest_news('sohu', top=10, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=10, show_content=True)

news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
news_df = preprocessing.data_filter(news_df)
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)
news_df['content'] = news_df['content'].map(preprocessing.clean_content)

news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))
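
preprocessing.clean_content is project-specific and not shown in these examples; a hypothetical stand-in, assuming it only strips leftover HTML and normalizes whitespace, might be:

import re

def clean_content(text):
    """Hypothetical stand-in for preprocessing.clean_content."""
    text = re.sub(r'<[^>]+>', ' ', str(text))  # drop residual HTML tags
    text = re.sub(r'\s+', ' ', text)           # collapse runs of whitespace
    return text.strip()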