def main():
    # Data preprocessing
    labels_file = 'data/THUCNews_labels.txt'
    # word2vec_path = 'word2vec/THUCNews_word2vec300.model'
    word2vec_path = "../word2vec/models/THUCNews_word2Vec/THUCNews_word2Vec_128.model"
    models_path = 'models/checkpoints/model-30000'
    batch_size = 128
    val_dir = './data/val_data'
    batch_predict(val_dir=val_dir,
                  labels_file=labels_file,
                  models_path=models_path,
                  word2vec_path=word2vec_path,
                  batch_size=batch_size)

    test_path = '/home/ubuntu/project/tfTest/THUCNews/my_test'
    files_list = files_processing.get_files_list(test_path, postfix='*.txt')
    text_predict(files_list, labels_file, models_path, word2vec_path, batch_size)
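Before predicting, it is worth confirming that the embedding dimension of the word2vec model matches the 128-dimensional checkpoint the paths above suggest. A minimal sketch, assuming the model was trained and saved with gensim as in the training script further down:

# Sanity check (assumes a gensim-trained model, as in the training script below):
# the loaded embedding size should match the 128 in the model filename.
from gensim.models import word2vec

w2v = word2vec.Word2Vec.load(
    "../word2vec/models/THUCNews_word2Vec/THUCNews_word2Vec_128.model")
print(w2v.wv.vector_size)  # expected: 128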
# Word segmentation: segment the full content of each file
def segment_lines(file_list, segment_out_dir, stopwords=[]):
    for i, file in enumerate(file_list):
        segment_out_name = os.path.join(segment_out_dir, 'segment_{}.txt'.format(i))
        with open(file, 'rb') as f:
            document = f.read()
            document_cut = jieba.cut(document)  # jieba accepts bytes and decodes them
            sentence_segment = []
            for word in document_cut:
                if word not in stopwords:
                    sentence_segment.append(word)
            result = ' '.join(sentence_segment)
            result = result.encode('utf-8')
            with open(segment_out_name, 'wb') as f2:
                f2.write(result)


# Segment the txt files under source and write the output to the segment directory
file_list = files_processing.get_files_list(source_folder, postfix='*.txt')
segment_lines(file_list, segment_folder)


# Run word_seg first for Chinese word segmentation, then run word_similarity
# Convert words to vectors, then compute their similarity
# If the directory contains multiple files, use PathLineSentences
import multiprocessing
from gensim.models import word2vec

segment_folder = r'F:\PycharmProjects\word2vec\segment'  # raw string avoids invalid escapes

sentences = word2vec.PathLineSentences(segment_folder)

# Set the model parameters and train
# (in gensim >= 4.0, size= and iter= were renamed to vector_size= and epochs=)
model = word2vec.Word2Vec(sentences,
                          size=128,
                          window=5,
                          iter=1000,
                          min_count=10,
                          workers=multiprocessing.cpu_count())
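Once training finishes, the model can be saved and queried through gensim's standard API. A minimal sketch; the save path and the query words are illustrative, and the words must have survived min_count to be in the vocabulary:

# Save the trained model, then query it (path and words are illustrative)
model.save('models/THUCNews_word2Vec_128.model')

print(model.wv.similarity(u'中国', u'北京'))  # cosine similarity between two words
for word, score in model.wv.most_similar(u'中国', topn=5):
    print(word, score)                        # the five nearest words in vector space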
        label.write(str(type))
        file_segment.write(result)
    file_segment.close()
    label.close()
    return segment_out_name, label_out_name


def MergeTxt(filepath, outfile):
    # Merge every txt file under filepath into a single output file
    with open(filepath + outfile, 'a+', encoding='utf-8') as k:
        for parent, dirnames, filenames in os.walk(filepath):
            for filename in filenames:
                txtPath = os.path.join(parent, filename)
                with open(txtPath, encoding='utf-8') as f:
                    k.write(f.read() + "\n")


if __name__ == '__main__':
    # Parallel segmentation (POSIX systems only)
    # jieba.enable_parallel()

    # Load a user-defined dictionary
    user_path = 'data/n.txt'
    jieba.load_userdict(user_path)

    stopwords_path = 'data/stopwords.txt'
    stopwords = getStopwords(stopwords_path)

    file_dir = 'data/source/biaozhu'
    segment_out_dir = 'data/segment/biaozhu_property'
    file_list = files_processing.get_files_list(file_dir, postfix='*.txt')
    segment_lines(file_list, segment_out_dir, stopwords)
    # segment_lines(file_list, segment_out_dir)
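The getStopwords helper called above is not shown in this post. A minimal, hypothetical sketch of the behavior the script relies on, reading one stopword per line:

# Hypothetical implementation of getStopwords, which is not shown above:
# read one stopword per line into a set for fast membership tests.
def getStopwords(stopwords_path):
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())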
# -*-coding: utf-8 -*-
# Chinese word segmentation for txt files
import jieba
import os
from utils import files_processing


# Word segmentation: segment the full content of each file
def segment_lines(file_list, segment_out_dir, stopwords=[]):
    for i, file in enumerate(file_list):
        segment_out_name = os.path.join(segment_out_dir, 'segment_{}.txt'.format(i))
        with open(file, 'rb') as f:
            document = f.read()
            document_cut = jieba.cut(document)
            sentence_segment = []
            for word in document_cut:
                if word not in stopwords:
                    sentence_segment.append(word)
            result = ' '.join(sentence_segment)
            result = result.encode('utf-8')
            with open(segment_out_name, 'wb') as f2:
                f2.write(result)


# Segment the txt files under source and write the output to the segment directory
file_list = files_processing.get_files_list('./source', postfix='*.txt')
segment_lines(file_list, './segment')
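files_processing.get_files_list comes from a local utils package that is not included in this post. A minimal glob-based stand-in with the behavior these scripts rely on, recursively listing the files that match a postfix:

# Hypothetical stand-in for utils.files_processing.get_files_list (not shown
# in this post): recursively collect files matching the given postfix,
# sorted so the segment_{i}.txt numbering is deterministic.
import glob
import os

def get_files_list(file_dir, postfix='*.txt'):
    return sorted(glob.glob(os.path.join(file_dir, '**', postfix), recursive=True))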