def preprocess():
    """Preprocess the currently selected news and save the tokenized result."""
    if filter_df0.shape[0] == 0:
        messagebox.showinfo('Message', '未选择新闻数据!')  # "No news data selected!"
        return
    df = filter_df0.copy()
    # Clean whitespace in titles and strip noise from the article body.
    df['title_'] = df['title'].map(lambda x: preprocessing.clean_title_blank(x))
    df['content_'] = df['content'].map(lambda x: preprocessing.clean_content(x))
    # Keep only numbers, English and Chinese characters.
    df['content_'] = df['content_'].map(lambda x: preprocessing.get_num_en_ch(x))
    # POS-tagged segmentation with a custom user dictionary.
    df['content_cut'] = df['content_'].map(lambda x: preprocessing.pseg_cut(
        x, userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    # Keep only tokens whose POS flags match these patterns.
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i']))
    # Remove stop words, normalize ambiguous terms, then filter
    # single-character tokens against the project dictionaries.
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x, os.path.join(extra_dict_path, 'self_individual_character_dict.txt')))
    # Space-joined token string for downstream vectorizers.
    df['content_'] = df['content_cut'].map(lambda x: ' '.join(x))
    news_pandas.save_news(df, os.path.join(temp_news_path, 'news_cut.csv'))
    messagebox.showinfo('Message', '数据预处理完成!')  # "Data preprocessing finished!"
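# Hedged usage note (not from the source): preprocess() reads the module-level
# globals filter_df0, extra_dict_path and temp_news_path, and reports progress
# through tkinter.messagebox, so it is presumably registered as a GUI callback
# along these lines (the button label and layout are hypothetical):
#
#     import tkinter as tk
#     root = tk.Tk()
#     tk.Button(root, text='预处理', command=preprocess).pack()  # "Preprocess"
#     root.mainloop()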
def content_preprocess(df_content):
    """Clean and tokenize the news content column (word segmentation)."""
    # Strip noise from the article body, then keep only numbers, English
    # and Chinese characters.
    df_content['content_'] = df_content['content'].map(
        lambda x: preprocessing.clean_content(x))
    df_content['content_'] = df_content['content_'].map(
        lambda x: preprocessing.get_num_en_ch(x))
    # POS-tagged segmentation with a custom user dictionary.
    df_content['content_cut'] = df_content['content_'].map(
        lambda x: preprocessing.pseg_cut(
            x, userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    # Keep only tokens whose POS flags match these patterns.
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng']))
    # Remove stop words: first the HIT list, then the project-specific list.
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'HIT_stop_words.txt')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    # Normalize ambiguous terms, then filter single-character tokens
    # against the project dictionaries.
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x, os.path.join(extra_dict_path, 'self_individual_character_dict.txt')))
    # Space-joined token string for downstream vectorizers.
    df_content['content_'] = df_content['content_cut'].map(
        lambda x: ' '.join(x))
    return df_content
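def _demo_content_preprocess():
    # Minimal usage sketch; the _demo_* name and the sample text are
    # hypothetical, not part of the original code. Assumes utils.preprocessing
    # and the dictionary files under extra_dict_path are available.
    import pandas as pd
    sample = pd.DataFrame({'content': ['<p>今日股市大涨，沪指收盘上涨2%。</p>']})
    result = content_preprocess(sample)
    print(result['content_cut'].iloc[0])  # filtered token list
    print(result['content_'].iloc[0])     # space-joined token string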
def f(text):
    """Extract the single highest-ranked key sentence from raw text."""
    text = preprocessing.clean_content(text)
    text = modeling.get_key_sentences(text, num=1)
    return text
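def _demo_key_sentence():
    # Usage sketch for f(); the _demo_* helper and sample text are
    # hypothetical, not part of the original code.
    raw = '央行今日宣布降准。市场普遍认为这将释放流动性，利好股市。'
    print(f(raw))  # prints the single key sentence selected by the model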
# -*- coding: utf-8 -*-
import os
from datetime import datetime

import pandas as pd

from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root path
news_path = os.path.join(project_path, 'news')  # directory for crawled news data
if not os.path.exists(news_path):  # create the news folder if it does not exist
    os.mkdir(news_path)

# Crawl the latest news (including full article content) from each portal.
sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
sohu_news_df = news_crawler.get_latest_news('sohu', top=10, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=10, show_content=True)

news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

# Merge the three sources and drop rows that fail the basic data filter.
news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
news_df = preprocessing.data_filter(news_df)

# Optionally restrict to a recent time window:
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)

news_df['content'] = news_df['content'].map(lambda x: preprocessing.clean_content(x))
news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))
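# Quick sanity check (assumption, not from the source: save_news writes a
# standard CSV that pandas.read_csv can load back with the same columns).
latest = pd.read_csv(os.path.join(news_path, 'latest_news.csv'))
print(latest[['title', 'content']].head())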