# Total word count across all files under dir.
def words_total(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))  # number of files
    totalwords = 0
    for file in filelists:
        totalwords += words_count(file)
    return totalwords
# Largest word count among all files under dir, together with the file path.
def words_max(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))  # number of files
    maxwords = 0
    name = ""
    for file in filelists:
        words = words_count(file)
        if words >= maxwords:
            maxwords = words
            name = file
    return maxwords, name
# Average word count per file under dir (0 when the directory is empty).
def words_average(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))  # number of files
    if len(filelists) == 0:
        averagewords = 0
    else:
        totalwords = 0
        for file in filelists:
            totalwords += words_count(file)
        averagewords = totalwords / len(filelists)
    return averagewords
# Vocabulary size (number of distinct words) over all files under dir.
def words_vocab(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))  # number of files
    totalwords = []
    for file in filelists:
        with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
            sentence = file_in.read()
            totalwords.extend(Wakati.words_list(sentence))
    counter = Counter(totalwords)
    vocab_count = len(counter)
    return vocab_count
# Smallest word count among all files under dir, together with the file path.
def words_min(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))  # number of files
    if len(filelists) == 0:
        minwords = 0
        name = None
    else:
        minwords = math.inf
        for file in filelists:
            words = words_count(file)
            if words <= minwords:
                minwords = words
                name = file
    return minwords, name
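# The helpers above all call a words_count(file) function that is not shown in
# this section. The sketch below is a hypothetical minimal implementation,
# assuming Wakati.words_list tokenizes a text into a list of words (as it is
# used in words_vocab); it is not the original code.
def words_count(file):
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        text = file_in.read()
    return len(Wakati.words_list(text))

# Example usage (directory path is a placeholder):
#     total = words_total("corpus_dir")
#     maxwords, longest_file = words_max("corpus_dir")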
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import os
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation

INPUT_DIR = XXXXXXXXXX
filelist = File_operation.get_all_paths(INPUT_DIR)
OUTPUT_DIR = XXXXXXXXXX

# Write a numbered index of every file path under INPUT_DIR to a CSV file.
with open(os.path.join(OUTPUT_DIR, "all_path(EncyclopediaMypedia58312).csv"), 'w', encoding='UTF-8-sig') as file_out:
    for i, file in enumerate(tqdm(filelist)):
        name = file.replace(",", "")  # strip commas so the path does not break the CSV
        file_out.write(str(i + 1) + "," + name + "\n")
import re
import os
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation

INPUT_DIR = XXXXXXXXXX
filelist = os.listdir(INPUT_DIR)
OUTPUT_DIR = rXXXXXXXXXX

with open(os.path.join(OUTPUT_DIR, "Livedoor_info.csv"), 'w', encoding='UTF-8-sig') as file_out:  # for writing
    for list in tqdm(filelist):  # 'list' is the genre directory name
        count = 0
        print("ジャンル:" + list)  # genre
        filelists = File_operation.get_all_paths(os.path.join(INPUT_DIR, list))
        file_out.write("URL,投稿日時,ジャンル,タイトル\n")  # header (URL, posted date, genre, title), written once per genre
        for file in filelists:
            count += 1
            with open(file, 'r', encoding='UTF-8') as file_in:  # for reading
                lines = file_in.readlines()
                file_out.write(lines[0].replace("\n", "").replace(",", "") + ","
                               + lines[1].replace("\n", "").replace(",", "") + ","
                               + list + ","
                               + lines[2].replace("\n", "").replace(",", "") + "\n")
import re
import os
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Delete
from Preprocessing import Sentence

INPUT_DIR = XXXXXXXXXX
dirlist = os.listdir(INPUT_DIR)
OUTPUT_DIR = XXXXXXXXXX

for dir in dirlist:
    print(dir)
    filelist = File_operation.get_all_paths(os.path.join(INPUT_DIR, dir))
    for i, file in enumerate(tqdm(filelist)):
        with open(file, 'r', encoding='UTF-8') as file_in:
            lines = file_in.readlines()
            title = lines[2]
            title = Delete.title(title)
            OUT = os.path.join(OUTPUT_DIR, dir)
            os.makedirs(OUT, exist_ok=True)
            with open(os.path.join(OUT, title + ".txt"), 'w', encoding='UTF-8') as file_out:
                for line in lines[3:]:
                    if not line == "\n":
                        sentencelists = Sentence.sentence_novel(line)
                        for sentence in sentencelists:
                            text = Delete.delete_wikipedia(sentence)
                            if not text == "":
                                file_out.write(text + "\n")  # assumption: each non-empty cleaned sentence is written on its own line
# Imports and the keyword variable are assumed here, following the other scripts in this repository.
import os
import json
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Wakati

INPUT_DIR = XXXXXXXXXX
OUTPUT_DIR = XXXXXXXXXX
keyword = XXXXXXXXXX  # placeholder: name of the output JSON file

def read_document(path):
    """Read a document from a file."""
    with open(path, 'r', encoding='UTF-8', errors='ignore') as f:
        return f.read()

def corpus_to_dictionary(corpus):
    """Return a dictionary mapping each document's tag (its full path) to its list of words."""
    dictionary = {}
    docs = [read_document(x) for x in corpus]
    for idx, (doc, name) in enumerate(tqdm(zip(docs, corpus))):
        words = Wakati.words_list(doc)
        dictionary[name] = words
    return dictionary

if __name__ == '__main__':
    print(keyword)
    corpus = File_operation.get_all_paths(INPUT_DIR)
    dictionary = corpus_to_dictionary(corpus)
    with open(os.path.join(OUTPUT_DIR, keyword + ".json"), 'w', encoding='UTF-8') as file_out:
        json.dump(dictionary, file_out)
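# For reference (not part of the original script): the JSON dumped above is a
# dict of {file path: word list}, which is the shape the training script below
# expects back from json.load. A hypothetical sanity check, assuming the same
# OUTPUT_DIR and keyword:
#
#     with open(os.path.join(OUTPUT_DIR, keyword + ".json"), 'r', encoding='UTF-8') as f:
#         dictionary = json.load(f)
#     print("documents:", len(dictionary))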
        # rank = [docid for docid, sim in sims].index(sentences[doc_id].tags[0])
        # ranks.append(rank)
        # print(collections.Counter(ranks))
        model.save(os.path.join(OUTPUT_DIR, OUTPUT_MODEL_NAME + "_" + str(x + 1) + ".model"))
    return model

if __name__ == '__main__':
    sentences = []
    if os.path.isdir(INPUT):
        print("Multi")
        filelists = File_operation.get_all_paths(INPUT)
        for i, file in enumerate(filelists):
            with open(file, 'r', encoding='UTF-8') as f:
                json_datas = json.load(f)
            sentence = corpus_to_sentences(json_datas)
            sentences.extend(sentence)
            print(len(sentence))
    else:
        print("One")
        with open(INPUT, 'r', encoding='UTF-8') as f:
            json_datas = json.load(f)
        sentence = corpus_to_sentences(json_datas)
        sentences.extend(sentence)
        print(len(sentence))
    print("ファイル数:" + str(len(sentences)))  # total number of documents gathered from the input file(s)
    train(sentences)
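# corpus_to_sentences is called above but not shown in this fragment. A minimal
# sketch of what it plausibly does, assuming gensim's TaggedDocument and a JSON
# dict of {file path: word list} as produced by corpus_to_dictionary; this is
# an assumption, not the original implementation.
from gensim.models.doc2vec import TaggedDocument

def corpus_to_sentences(json_datas):
    # One TaggedDocument per document, tagged with its file path.
    return [TaggedDocument(words=words, tags=[name]) for name, words in json_datas.items()]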