def preprocess():
    """Run the content preprocessing pipeline on the currently selected news.

    Reads the module-level ``filter_df0``.  When it has no rows, an info
    dialog is shown and the function returns without doing anything else.
    Otherwise a copy of the frame is taken and the derived columns
    ``title_``, ``content_`` and ``content_cut`` are filled in by a fixed
    sequence of ``preprocessing`` helpers.  The frame is then passed to
    ``news_pandas.save_news`` together with the path ``news_cut.csv`` under
    ``temp_news_path``, and a completion dialog is shown.

    Returns:
        None.
    """
    if filter_df0.shape[0] == 0:
        messagebox.showinfo('Message', '未选择新闻数据!')
        return

    # Dictionary files handed to the preprocessing helpers.
    userdict_file = os.path.join(extra_dict_path, 'self_userdict.txt')
    stop_words_file = os.path.join(extra_dict_path, 'self_stop_words.txt')
    disambiguation_file = os.path.join(
        extra_dict_path, 'self_disambiguation_dict.json')
    single_char_file = os.path.join(
        extra_dict_path, 'self_individual_character_dict.txt')

    def segment(text):
        return preprocessing.pseg_cut(text, userdict_path=userdict_file)

    def keep_flagged(tagged):
        # The flag list is built anew on each call, so the helper never
        # receives a list shared with a previous row.
        return preprocessing.get_words_by_flags(
            tagged, flags=['n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i'])

    def drop_stop_words(words):
        return preprocessing.stop_words_cut(words, stop_words_file)

    def disambiguate(words):
        return preprocessing.disambiguation_cut(words, disambiguation_file)

    def drop_single_chars(words):
        return preprocessing.individual_character_cut(words, single_char_file)

    def join_words(words):
        return ' '.join(words)

    # Each entry is (target column, source column, per-row transform).
    # Steps run strictly in this order and every step writes its result back
    # to the frame before the next one reads from it.
    pipeline = (
        ('title_', 'title', preprocessing.clean_title_blank),
        ('content_', 'content', preprocessing.clean_content),
        ('content_', 'content_', preprocessing.get_num_en_ch),
        ('content_cut', 'content_', segment),
        ('content_cut', 'content_cut', keep_flagged),
        ('content_cut', 'content_cut', drop_stop_words),
        ('content_cut', 'content_cut', disambiguate),
        ('content_cut', 'content_cut', drop_single_chars),
        # Finally, content_ is overwritten with the space-joined tokens.
        ('content_', 'content_cut', join_words),
    )

    df = filter_df0.copy()
    for target, source, transform in pipeline:
        df[target] = df[source].map(transform)

    output_file = os.path.join(temp_news_path, 'news_cut.csv')
    news_pandas.save_news(df, output_file)
    messagebox.showinfo('Message', '数据预处理完成!')
def title_preprocess(df_title):
    """Tokenize and clean the news titles.

    Fills the ``title_`` and ``title_cut`` columns of ``df_title`` by mapping
    a fixed sequence of ``preprocessing`` helpers over the ``title`` column.
    Stop words are removed in two passes: first with ``HIT_stop_words.txt``,
    then with ``self_stop_words.txt``.  At the end ``title_`` holds the
    entries of ``title_cut`` joined with single spaces.

    Args:
        df_title: Frame with a ``title`` column.  It is modified in place.

    Returns:
        The same ``df_title`` object, with the derived columns set.
    """
    # Dictionary files handed to the preprocessing helpers.
    userdict_file = os.path.join(extra_dict_path, 'self_userdict.txt')
    hit_stop_words_file = os.path.join(extra_dict_path, 'HIT_stop_words.txt')
    self_stop_words_file = os.path.join(extra_dict_path, 'self_stop_words.txt')
    disambiguation_file = os.path.join(
        extra_dict_path, 'self_disambiguation_dict.json')
    single_char_file = os.path.join(
        extra_dict_path, 'self_individual_character_dict.txt')

    def segment(text):
        return preprocessing.pseg_cut(text, userdict_path=userdict_file)

    def keep_flagged(tagged):
        # The flag list is built anew on each call, so the helper never
        # receives a list shared with a previous row.
        return preprocessing.get_words_by_flags(
            tagged, flags=['n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng'])

    def drop_hit_stop_words(words):
        return preprocessing.stop_words_cut(words, hit_stop_words_file)

    def drop_self_stop_words(words):
        return preprocessing.stop_words_cut(words, self_stop_words_file)

    def disambiguate(words):
        return preprocessing.disambiguation_cut(words, disambiguation_file)

    def drop_single_chars(words):
        return preprocessing.individual_character_cut(words, single_char_file)

    def join_words(words):
        return ' '.join(words)

    # Each entry is (target column, source column, per-row transform).
    # Steps run strictly in this order and every step writes its result back
    # to the frame before the next one reads from it.
    pipeline = (
        ('title_', 'title', preprocessing.clean_title),
        ('title_', 'title_', preprocessing.get_num_en_ch),
        ('title_cut', 'title_', segment),
        ('title_cut', 'title_cut', keep_flagged),
        ('title_cut', 'title_cut', drop_hit_stop_words),
        ('title_cut', 'title_cut', drop_self_stop_words),
        ('title_cut', 'title_cut', disambiguate),
        ('title_cut', 'title_cut', drop_single_chars),
        # Finally, title_ is overwritten with the space-joined tokens.
        ('title_', 'title_cut', join_words),
    )

    for target, source, transform in pipeline:
        df_title[target] = df_title[source].map(transform)

    return df_title