def title_cluster(df, save_df=False):
    """Cluster news items by title."""
    df_title = df.copy()
    df_title = title_preprocess(df_title)
    word_library_list = counter.get_word_library(df_title['title_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(df_title['title_cut'])
    # Cap the feature count at the vocabulary size minus half of the words that occur only once.
    max_features = len(word_library_list) - len(single_frequency_words_list) // 2
    title_matrix = modeling.feature_extraction(df_title['title_'], vectorizer='CountVectorizer',
                                               vec_args={'max_df': 1.0, 'min_df': 1, 'max_features': max_features})
    title_dbscan = modeling.get_cluster(title_matrix, cluster='DBSCAN',
                                        cluster_args={'eps': 0.4, 'min_samples': 4, 'metric': 'cosine'})
    title_labels = modeling.get_labels(title_dbscan)
    df_title['title_label'] = title_labels
    df_non_outliers = modeling.get_non_outliers_data(df_title, label_column='title_label')
    title_label_num = counter.get_num_of_value_no_repeat(df_non_outliers['title_label'].tolist())
    print('Clustering by news title produced %d clusters (excluding outliers)' % title_label_num)
    title_rank = modeling.label2rank(title_labels)
    df_title['title_rank'] = title_rank
    for i in range(1, title_label_num + 1):
        df_ = df_title[df_title['title_rank'] == i]
        title_top_list = counter.get_most_common_words(df_['title_cut'], top_n=10)
        print(title_top_list)
    if save_df:
        df_title.drop(['content', 'title_', 'title_label'], axis=1, inplace=True)
        news_crawler.save_news(df_title, os.path.join(results_path, 'df_title_rank.csv'))
    return df_title
def content_cluster(df, df_save=False):
    """Cluster news items by content."""
    df_content = df.copy()
    df_content = content_preprocess(df_content)
    word_library_list = counter.get_word_library(df_content['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(df_content['content_cut'])
    # Same feature cap as in title_cluster: vocabulary size minus half of the single-occurrence words.
    max_features = len(word_library_list) - len(single_frequency_words_list) // 2
    content_matrix = modeling.feature_extraction(df_content['content_'], vectorizer='CountVectorizer',
                                                 vec_args={'max_df': 0.95, 'min_df': 1, 'max_features': max_features})
    content_dbscan = modeling.get_cluster(content_matrix, cluster='DBSCAN',
                                          cluster_args={'eps': 0.35, 'min_samples': 4, 'metric': 'cosine'})
    content_labels = modeling.get_labels(content_dbscan)
    df_content['content_label'] = content_labels
    df_non_outliers = modeling.get_non_outliers_data(df_content, label_column='content_label')
    content_label_num = counter.get_num_of_value_no_repeat(df_non_outliers['content_label'].tolist())
    print('Clustering by news content produced %d clusters (excluding outliers)' % content_label_num)
    content_rank = modeling.label2rank(content_labels)
    df_content['content_rank'] = content_rank
    for i in range(1, content_label_num + 1):
        df_ = df_content[df_content['content_rank'] == i]
        content_top_list = counter.get_most_common_words(df_['content_cut'], top_n=15, min_frequency=1)
        print(content_top_list)
    if df_save:
        df_content.drop(['content_', 'content_label'], axis=1, inplace=True)
        news_crawler.save_news(df_content, os.path.join(results_path, 'df_content_rank.csv'))
    return df_content
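# The two clustering functions above delegate vectorisation and clustering to
# modeling.feature_extraction and modeling.get_cluster. Judging by the argument
# names ('CountVectorizer', 'DBSCAN', eps, min_samples, metric='cosine'), these
# helpers presumably wrap scikit-learn; the function below is an illustrative,
# self-contained sketch of that assumed equivalent and is not part of the pipeline.
def _sklearn_clustering_sketch(texts, eps=0.35, min_samples=4, max_features=None):
    """Bag-of-words + cosine-distance DBSCAN over a list of tokenised strings."""
    from sklearn.cluster import DBSCAN
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(max_df=0.95, min_df=1, max_features=max_features)
    matrix = vectorizer.fit_transform(texts)
    # With metric='cosine' scikit-learn uses brute-force neighbour search;
    # samples labelled -1 are outliers (points in no cluster).
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(matrix)
    return dbscan.labels_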
def key_content(df, df_save=False):
    """Extract a one-sentence abstract for each news item."""
    def f(text):
        text = preprocessing.clean_content(text)
        text = modeling.get_key_sentences(text, num=1)
        return text
    df['abstract'] = df['content'].map(f)
    if df_save:
        df.drop(['content'], axis=1, inplace=True)
        news_crawler.save_news(df, os.path.join(results_path, 'df_abstract.csv'))
    return df
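# Minimal end-to-end usage sketch for the three steps above. Assumptions: pandas
# is importable, and 'news/latest_news.csv' is the file written by the crawler
# script; adjust the path to wherever your crawled news actually lives.
if __name__ == '__main__':
    import pandas as pd

    df_news = pd.read_csv('news/latest_news.csv')  # illustrative path
    # title_cluster and content_cluster work on internal copies of the frame;
    # key_content modifies the frame it is given, so it runs last.
    title_cluster(df_news, save_df=True)
    content_cluster(df_news, df_save=True)
    key_content(df_news, df_save=True)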
import os
import pandas as pd
from datetime import datetime

from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root path
news_path = os.path.join(project_path, 'news')  # directory for crawled news data
if not os.path.exists(news_path):  # create the news folder if it does not exist
    os.mkdir(news_path)

# sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
# sohu_news_df = news_crawler.get_latest_news('sohu', top=100, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=100000, show_content=True)
# news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
# news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

# news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
# news_df = preprocessing.data_filter(news_df)
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)
# news_df['content'] = news_df['content'].map(lambda x: preprocessing.clean_content(x))
# news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))
# -*- coding: utf-8 -*-
import os
import pandas as pd
from datetime import datetime

from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root path
news_path = os.path.join(project_path, 'news')  # directory for crawled news data
if not os.path.exists(news_path):  # create the news folder if it does not exist
    os.mkdir(news_path)

sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
sohu_news_df = news_crawler.get_latest_news('sohu', top=10, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=10, show_content=True)
news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
news_df = preprocessing.data_filter(news_df)
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)
news_df['content'] = news_df['content'].map(lambda x: preprocessing.clean_content(x))
news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))