def preprocess():
    if filter_df0.shape[0] == 0:
        messagebox.showinfo('Message', 'No news data selected!')
        return
    df = filter_df0.copy()
    # Clean up whitespace in titles and strip noise from the content,
    # keeping only digits, English letters, and Chinese characters
    df['title_'] = df['title'].map(
        lambda x: preprocessing.clean_title_blank(x))
    df['content_'] = df['content'].map(
        lambda x: preprocessing.clean_content(x))
    df['content_'] = df['content_'].map(
        lambda x: preprocessing.get_num_en_ch(x))
    # POS-tagged word segmentation with a custom user dictionary
    df['content_cut'] = df['content_'].map(lambda x: preprocessing.pseg_cut(
        x, userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    # Keep only the listed POS flags (nouns, verbs, English tokens,
    # time/locative words, abbreviations, and idioms)
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i']))
    # Remove stop words, resolve ambiguous terms, and handle
    # single-character tokens via the custom dictionaries
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x,
            os.path.join(extra_dict_path, 'self_individual_character_dict.txt')
        ))
    # Rebuild content_ as a space-joined token string
    df['content_'] = df['content_cut'].map(lambda x: ' '.join(x))
    news_pandas.save_news(df, os.path.join(temp_news_path, 'news_cut.csv'))
    messagebox.showinfo('Message', 'Data preprocessing complete!')
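This function reads several module-level names (`filter_df0`, `extra_dict_path`, `temp_news_path`, `messagebox`, `news_pandas`) that are defined elsewhere in the GUI. A minimal sketch of how they might be wired up, assuming `news_pandas.load_news` is the counterpart of the `save_news` call above; the dictionary directory and the loaded file name are hypothetical:

import os
from tkinter import messagebox  # standard-library dialog used above

from utils import news_pandas, preprocessing  # assumed project layout

project_path = os.path.dirname(os.path.realpath(__file__))
extra_dict_path = os.path.join(project_path, 'extra_dict')  # assumed location
temp_news_path = os.path.join(project_path, 'temp_news')    # assumed location
# Hypothetical: load whatever subset of news the user selected in the UI
filter_df0 = news_pandas.load_news(
    os.path.join(temp_news_path, 'news_filtered.csv'))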
Example #2
def content_preprocess(df_content):
    """新闻内容分词处理"""
    df_content['content_'] = df_content['content'].map(
        lambda x: preprocessing.clean_content(x))
    df_content['content_'] = df_content['content_'].map(
        lambda x: preprocessing.get_num_en_ch(x))
    df_content['content_cut'] = df_content['content_'].map(
        lambda x: preprocessing.pseg_cut(
            x,
            userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    # Keep nouns, verbs, and English tokens; '.*n' additionally keeps
    # compound tags ending in n (e.g. 'vn', 'an')
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng']))
    # Apply both the HIT (Harbin Institute of Technology) stop-word list
    # and the project's own custom list
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'HIT_stop_words.txt')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x,
            os.path.join(extra_dict_path, 'self_individual_character_dict.txt')
        ))
    df_content['content_'] = df_content['content_cut'].map(
        lambda x: ' '.join(x))
    return df_content
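A short usage sketch for `content_preprocess`; the sample rows are illustrative only, and it assumes `preprocessing` and `extra_dict_path` are already set up as in the snippet above:

import pandas as pd

sample = pd.DataFrame({'content': [
    '双方将于下周在北京举行会谈。',
    '新产品发布会吸引了大量用户关注。',
]})
sample = content_preprocess(sample)
print(sample['content_cut'])  # token lists after POS and stop-word filtering
print(sample['content_'])     # space-joined tokens, ready for vectorizers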
Example #3
def f(text):
    """Return the top-ranked key sentence of the cleaned text."""
    text = preprocessing.clean_content(text)
    text = modeling.get_key_sentences(text, num=1)
    return text
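Usage sketch: `f` condenses an article to its single highest-ranked key sentence, assuming `modeling.get_key_sentences(text, num=1)` returns the `num` highest-scoring sentences (e.g. TextRank-style) and that both helpers live in this project's `utils` package:

from utils import preprocessing, modeling  # assumed import path

article = '昨日股市大幅上涨。分析人士认为,政策利好是主要原因。市场情绪明显回暖。'
print(f(article))  # expected: the single most representative sentence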
Example #4
# -*- coding: utf-8 -*-

import os
import pandas as pd
from datetime import datetime
from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root path
news_path = os.path.join(project_path, 'news')  # directory for the news data
if not os.path.exists(news_path):  # create the news folder if it doesn't exist
    os.mkdir(news_path)

sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
sohu_news_df = news_crawler.get_latest_news('sohu', top=10, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=10, show_content=True)

news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
news_df = preprocessing.data_filter(news_df)
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)
news_df['content'] = news_df['content'].map(lambda x: preprocessing.clean_content(x))

news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))
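To pick the saved file back up in a later step, a plain pandas read is enough, continuing the script above (a sketch; it presumes `news_crawler.save_news` writes standard CSV, and the 'title' column follows the earlier examples):

latest_df = pd.read_csv(os.path.join(news_path, 'latest_news.csv'))
print(latest_df.shape)
print(latest_df[['title', 'content']].head())  # columns as used in the examples above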