Example #1
def main():
    # Model configuration: label file, word2vec embeddings and the trained classifier checkpoint
    labels_file = 'data/THUCNews_labels.txt'
    # word2vec_path = 'word2vec/THUCNews_word2vec300.model'
    word2vec_path = "../word2vec/models/THUCNews_word2Vec/THUCNews_word2Vec_128.model"
    models_path = 'models/checkpoints/model-30000'
    batch_size = 128
    val_dir = './data/val_data'

    batch_predict(val_dir=val_dir,
                  labels_file=labels_file,
                  models_path=models_path,
                  word2vec_path=word2vec_path,
                  batch_size=batch_size)

    test_path = '/home/ubuntu/project/tfTest/THUCNews/my_test'
    files_list = files_processing.get_files_list(test_path, postfix='*.txt')
    text_predict(files_list, labels_file, models_path, word2vec_path,
                 batch_size)
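
batch_predict, text_predict and files_processing come from the surrounding project and are not shown in this example. A minimal sketch of the entry point plus the gensim model loading that the prediction helpers presumably perform internally (the loader name load_embeddings is illustrative, not from the project):

# Sketch only: loading the embeddings this way is an assumption about what
# batch_predict/text_predict do internally, not the project's confirmed code.
from gensim.models import word2vec

def load_embeddings(word2vec_path):
    # word2vec_path points at a 128-dimensional gensim Word2Vec model
    return word2vec.Word2Vec.load(word2vec_path)

if __name__ == '__main__':
    main()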
Example #2
import multiprocessing
import os
import jieba
from gensim.models import word2vec
from utils import files_processing

# Segment the entire content of each file with jieba, dropping stopwords
def segment_lines(file_list, segment_out_dir, stopwords=[]):
    for i, file in enumerate(file_list):
        segment_out_name = os.path.join(segment_out_dir,
                                        'segment_{}.txt'.format(i))
        with open(file, 'rb') as f:
            document = f.read()
            document_cut = jieba.cut(document)
            sentence_segment = []
            for word in document_cut:
                if word not in stopwords:
                    sentence_segment.append(word)
            result = ' '.join(sentence_segment)
            result = result.encode('utf-8')
            with open(segment_out_name, 'wb') as f2:
                f2.write(result)


# Segment the txt files under source_folder and write the results into segment_folder
# (source_folder should point at the directory holding the raw txt corpus)
segment_folder = r'F:\PycharmProjects\word2vec\segment'
file_list = files_processing.get_files_list(source_folder, postfix='*.txt')
segment_lines(file_list, segment_folder)

# Run word_seg first for Chinese word segmentation, then run word_similarity:
# words are turned into vectors, and similarities are computed on those vectors

# When the corpus is split across several files in one directory, PathLineSentences reads them all
sentences = word2vec.PathLineSentences(segment_folder)

# Set the model parameters and train
# (size/iter are the gensim 3.x parameter names; gensim >= 4.0 renames them to vector_size/epochs)
model = word2vec.Word2Vec(sentences,
                          size=128,
                          window=5,
                          iter=1000,
                          min_count=10,
                          workers=multiprocessing.cpu_count())
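
The comments above mention a separate word_similarity step. A minimal sketch of that step, assuming the trained model is saved to an illustrative models/word2Vec.model path and that the query words actually occur in the corpus vocabulary:

# Sketch of the word_similarity step; the save path and query words are
# placeholders, not taken from the original script.
model_path = 'models/word2Vec.model'
model.save(model_path)

# Reload the trained embeddings and query them
model = word2vec.Word2Vec.load(model_path)
print(model.wv.similarity('北京', '上海'))    # cosine similarity between two words
print(model.wv.most_similar('北京', topn=5))  # five nearest words in vector space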
Example #3
                label.write(str(type))
            
            file_segment.write(result)
        file_segment.close()
        label.close()

        return segment_out_name, label_out_name

    def MergeTxt(filepath, outfile):
        # Append the content of every txt file under filepath into one output file
        out_path = os.path.join(filepath, outfile)
        with open(out_path, 'a+', encoding='utf-8') as k:
            for parent, dirnames, filenames in os.walk(filepath):
                for filename in filenames:
                    txtPath = os.path.join(parent, filename)
                    with open(txtPath, encoding='utf-8') as f:
                        k.write(f.read() + "\n")
        return out_path

if __name__ == '__main__':
    # Parallel word segmentation
    # jieba.enable_parallel()
    # Load a user-defined dictionary
    user_path = 'data/n.txt'
    jieba.load_userdict(user_path)

    stopwords_path = 'data/stopwords.txt'
    stopwords = getStopwords(stopwords_path)

    file_dir = 'data/source/biaozhu'
    segment_out_dir = 'data/segment/biaozhu_property'
    file_list = files_processing.get_files_list(file_dir, postfix='*.txt')
    segment_lines(file_list, segment_out_dir, stopwords)
    # segment_lines(file_list, segment_out_dir)
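
getStopwords is called above but not defined in this fragment. A minimal sketch, assuming data/stopwords.txt lists one stopword per line in UTF-8:

# Minimal sketch of getStopwords; assumes one stopword per line, UTF-8 encoded
def getStopwords(path):
    stopwords = set()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if word:
                stopwords.add(word)
    return stopwords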
# -*- coding: utf-8 -*-
# Chinese word segmentation for txt files
import jieba
import os
from utils import files_processing

# Word segmentation: segment the entire content of each file with jieba
def segment_lines(file_list, segment_out_dir, stopwords=[]):
    for i, file in enumerate(file_list):
        segment_out_name = os.path.join(segment_out_dir, 'segment_{}.txt'.format(i))
        with open(file, 'rb') as f:
            document = f.read()
            document_cut = jieba.cut(document)
            sentence_segment=[]
            for word in document_cut:
                if word not in stopwords:
                    sentence_segment.append(word)
            result = ' '.join(sentence_segment)
            result = result.encode('utf-8')
            with open(segment_out_name, 'wb') as f2:
                f2.write(result)

# Segment the txt files under ./source and write the results into ./segment
file_list = files_processing.get_files_list('./source', postfix='*.txt')
segment_lines(file_list, './segment')
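
files_processing.get_files_list comes from the project's utils package and is not shown in these examples. A minimal stand-in, assuming postfix is a glob pattern such as '*.txt':

# Minimal stand-in for utils.files_processing.get_files_list; the real helper
# may differ, this only matches how it is called in the examples above.
import glob
import os

def get_files_list(file_dir, postfix='*.txt'):
    # Recursively collect the files under file_dir that match the glob pattern
    pattern = os.path.join(file_dir, '**', postfix)
    return sorted(glob.glob(pattern, recursive=True))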