Example #1
0
def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    ## The subdirectory names are in fact the category names
    #print("Segmenting..Please wait.")

    # Get all the files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"
        seg_dir = seg_path + mydir + "/"
        if not os.path.exists(seg_dir):  # Create the segmentation directory if it does not already exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # Get all the texts of one category in the unsegmented corpus

        # Traverse and process all files in the category directory
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file
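Every example on this page relies on os, jieba, and two small I/O helpers, readfile and savefile, that are imported from elsewhere in each project and not shown here. A minimal sketch of what those helpers are assumed to look like (byte-oriented read and write, matching how the examples encode and decode content):

# Hypothetical stand-ins for the helpers the examples import; the originals are not shown.
import os
import jieba  # third-party segmenter used by corpus_segment above

def readfile(path):
    # Return the raw bytes of a file.
    with open(path, 'rb') as f:
        return f.read()

def savefile(path, content):
    # Write bytes to a file, overwriting any existing content.
    with open(path, 'wb') as f:
        f.write(content)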
Example #2
0
File: fenci.py  Project: Lumf111/text_cf
def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path

    print("Segmenting like crazy...")

    # Get all the files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # Build the path of the category subdirectory
        seg_dir = seg_path + mydir + "/"  # Build the corresponding output directory for segmented files
        if not os.path.exists(seg_dir):  # Create the segmentation directory if it does not already exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # Get all the texts of one category in the unsegmented corpus
        for file_path in file_list:  # Traverse all files in the category directory
            fullname = class_path + file_path  # Build the full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # Read the file content
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file to the output corpus directory
    print("Chinese corpus segmentation finished!!!")
Example #3
0
def predicted_classifier():
    corpus_path = "./upload_corpus/"
    content = request.args.get('content', '')
    hl = md5()
    hl.update(content.encode(encoding='utf-8'))
    filename = hl.hexdigest()
    savefile(corpus_path + filename + ".txt", content.encode('utf-8'))
    data = dict(class_type=classifier(filename))
    resp = jsonify(data)
    return resp
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    '''
    The subdirectory names are the category names.
    In train_corpus/it/21.txt, 'train_corpus/' is corpus_path and 'it' is one member of catelist.
    '''
    print("Segmenting like crazy...")
    # Get all the files under each directory (category)
    for mydir in catelist:
        '''
        Here mydir is the 'it' in train_corpus/it/21.txt (i.e. one category in catelist)
        '''
        class_path = corpus_path + mydir + "/"  # Build the category subdirectory path, e.g. train_corpus/it/
        seg_dir = seg_path + mydir + "/"  # Build the corresponding output directory, e.g. train_corpus_seg/it/

        if not os.path.exists(seg_dir):  # Create the segmentation directory if it does not already exist
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)  # Get all the texts of one category in the unsegmented corpus
        '''
        For train_corpus/it/ containing
        21.txt,
        22.txt,
        23.txt
        ...
        file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:  # Traverse all files in the category directory
            fullname = class_path + file_path  # Build the full path, e.g. train_corpus/it/21.txt
            content = readfile(fullname)  # Read the file content
            '''At this point content holds every character of the original text, including extra
            spaces, blank lines, carriage returns and so on. Next we strip out these irrelevant
            characters, leaving compact text separated only by punctuation.
            '''
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file to the output corpus directory

    print("Chinese corpus segmentation finished!!!")
Example #5
0
def corpus_segment(filename):
    # Segment the uploaded file
    corpus_path = "./upload_corpus/" + filename + ".txt"  # Path of the unsegmented file
    seg_path = "./upload_corpus_seg/" + filename + ".txt"  # Path of the segmented file
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    content = readfile(corpus_path)  # Read the file content
    '''At this point content holds every character of the original text, including extra
    spaces, blank lines, carriage returns and so on. Next we strip out these irrelevant
    characters, leaving compact text separated only by punctuation.
    '''
    content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
    content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
    content_seg = jieba.cut(content)  # Segment the file content
    savefile(seg_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file to the output corpus directory
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the corpus before segmentation
    seg_path is the path of the corpus after segmentation
    '''
    catelist = os.listdir(corpus_path)
    '''
    catelist records all the folder names in corpus_path, e.g. 'art', 'literature', 'education', ...
    '''
    print("jieba is working")
    # Visit the files under each folder
    for mydir in catelist:

        class_path = corpus_path + mydir + "/"  # train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # train_corpus_seg/art/

        if not os.path.exists(seg_dir):  # create train_corpus_seg if it does not exist
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)

        for file_path in file_list:  # visit every file in file_list
            fullname = class_path + file_path  # full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # read the .txt file
            '''Delete the extra whitespace, empty lines and carriage returns
            '''
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # delete carriage returns
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # delete whitespace
            content_seg = jieba.cut(content)  # segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))
            # save the segmented file to seg_path

    print("the segmentation of the corpus is finished!!!")
Example #7
0
"""
@file: fetch.py
@time: 2018/8/4 16:12
@software: PyCharm
"""
from bs4 import BeautifulSoup
import requests
import os
from Tools import savefile

# Scrape a corpus of financial news articles
corpus_path = "./train_corpus/C2-Financial/C2-Financial"
count = 0
start = 338000
end = 338100
for article_id in range(start, end):
    url = 'https://wallstreetcn.com/articles/' + str(article_id)
    res = requests.get(url)
    html = res.text
    """
    soup 表示被解析的html格式的内容
    html.parser表示解析用的解析器
    """
    soup = BeautifulSoup(html, "html.parser")
    soup_content = soup.find_all("div", class_="node-article-content")
    if soup_content:
        text = soup_content[0].get_text()
        count += 1
        file_path = corpus_path + str(count) + '.txt'
        savefile(file_path, text.encode('utf-8'))
        print(text)
        print(file_path)
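The loop above fires 100 requests back to back and assumes every request succeeds. A hedged variant of the fetch step with basic error handling and a pause between requests; the 1-second delay and 10-second timeout are assumptions, not values from the original script:

# Sketch of a politer fetch step; delay and timeout values are assumptions.
import time

for article_id in range(start, end):
    url = 'https://wallstreetcn.com/articles/' + str(article_id)
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
    except requests.RequestException:
        continue  # skip article IDs that fail to load
    time.sleep(1)  # pause between requests to avoid hammering the site
    # ... parse res.text with BeautifulSoup as in the original loop ...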