def preprocess():
    if filter_df0.shape[0] == 0:
        messagebox.showinfo('Message', 'No news data selected!')
        return
    df = filter_df0.copy()
    # Clean up whitespace in titles and strip noise from the content,
    # keeping only digits, English letters, and Chinese characters
    df['title_'] = df['title'].map(
        lambda x: preprocessing.clean_title_blank(x))
    df['content_'] = df['content'].map(
        lambda x: preprocessing.clean_content(x))
    df['content_'] = df['content_'].map(
        lambda x: preprocessing.get_num_en_ch(x))
    # POS-tagged word segmentation with a custom user dictionary
    df['content_cut'] = df['content_'].map(lambda x: preprocessing.pseg_cut(
        x, userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    # Keep only the listed POS flags (nouns, verbs, English tokens,
    # time/locative words, abbreviations, and idioms)
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i']))
    # Remove stop words, resolve ambiguous terms, and handle
    # single-character tokens via the custom dictionaries
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x,
            os.path.join(extra_dict_path, 'self_individual_character_dict.txt')
        ))
    # Rebuild content_ as a space-joined token string
    df['content_'] = df['content_cut'].map(lambda x: ' '.join(x))
    news_pandas.save_news(df, os.path.join(temp_news_path, 'news_cut.csv'))
    messagebox.showinfo('Message', 'Data preprocessing complete!')
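This function reads several module-level names (`filter_df0`, `extra_dict_path`, `temp_news_path`, `messagebox`, `news_pandas`) that are defined elsewhere in the GUI. A minimal sketch of how they might be wired up, assuming `news_pandas.load_news` is the counterpart of the `save_news` call above; the dictionary directory and the loaded file name are hypothetical:

import os
from tkinter import messagebox  # standard-library dialog used above

from utils import news_pandas, preprocessing  # assumed project layout

project_path = os.path.dirname(os.path.realpath(__file__))
extra_dict_path = os.path.join(project_path, 'extra_dict')  # assumed location
temp_news_path = os.path.join(project_path, 'temp_news')    # assumed location
# Hypothetical: load whatever subset of news the user selected in the UI
filter_df0 = news_pandas.load_news(
    os.path.join(temp_news_path, 'news_filtered.csv'))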
Example #2
def content_preprocess(df_content):
    """新闻内容分词处理"""
    df_content['content_'] = df_content['content'].map(
        lambda x: preprocessing.clean_content(x))
    df_content['content_'] = df_content['content_'].map(
        lambda x: preprocessing.get_num_en_ch(x))
    df_content['content_cut'] = df_content['content_'].map(
        lambda x: preprocessing.pseg_cut(
            x,
            userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    # Keep nouns, verbs, and English tokens; '.*n' additionally keeps
    # compound tags ending in n (e.g. 'vn', 'an')
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng']))
    # Apply both the HIT (Harbin Institute of Technology) stop-word list
    # and the project's own custom list
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'HIT_stop_words.txt')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df_content['content_cut'] = df_content['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x,
            os.path.join(extra_dict_path, 'self_individual_character_dict.txt')
        ))
    df_content['content_'] = df_content['content_cut'].map(
        lambda x: ' '.join(x))
    return df_content
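A short usage sketch for `content_preprocess`; the sample rows are illustrative only, and it assumes `preprocessing` and `extra_dict_path` are already set up as in the snippet above:

import pandas as pd

sample = pd.DataFrame({'content': [
    '双方将于下周在北京举行会谈。',
    '新产品发布会吸引了大量用户关注。',
]})
sample = content_preprocess(sample)
print(sample['content_cut'])  # token lists after POS and stop-word filtering
print(sample['content_'])     # space-joined tokens, ready for vectorizers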
Example #3
def f(text):
    """Return the top-ranked key sentence of the cleaned text."""
    text = preprocessing.clean_content(text)
    text = modeling.get_key_sentences(text, num=1)
    return text
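Usage sketch: `f` condenses an article to its single highest-ranked key sentence, assuming `modeling.get_key_sentences(text, num=1)` returns the `num` highest-scoring sentences (e.g. TextRank-style) and that both helpers live in this project's `utils` package:

from utils import preprocessing, modeling  # assumed import path

article = '昨日股市大幅上涨。分析人士认为,政策利好是主要原因。市场情绪明显回暖。'
print(f(article))  # expected: the single most representative sentence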
Example #4
# -*- coding: utf-8 -*-

import os
import pandas as pd
from datetime import datetime
from utils import news_crawler
from utils import preprocessing

project_path = os.path.dirname(os.path.realpath(__file__))  # project root path
news_path = os.path.join(project_path, 'news')  # directory for the news data
if not os.path.exists(news_path):  # create the news folder if it doesn't exist
    os.mkdir(news_path)

sina_news_df = news_crawler.get_latest_news('sina', top=60, show_content=True)
sohu_news_df = news_crawler.get_latest_news('sohu', top=10, show_content=True)
xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=10, show_content=True)

news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
news_df = preprocessing.data_filter(news_df)
# last_time = datetime.today().strftime('%Y-%m-%d %H:%M')  # format like '2018-04-06 23:59'
# news_df = preprocessing.get_data(news_df, last_time=last_time, delta=5)
news_df['content'] = news_df['content'].map(lambda x: preprocessing.clean_content(x))

news_crawler.save_news(news_df, os.path.join(news_path, 'latest_news.csv'))
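To pick the saved file back up in a later step, a plain pandas read is enough, continuing the script above (a sketch; it presumes `news_crawler.save_news` writes standard CSV, and the 'title' column follows the earlier examples):

latest_df = pd.read_csv(os.path.join(news_path, 'latest_news.csv'))
print(latest_df.shape)
print(latest_df[['title', 'content']].head())  # columns as used in the examples above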