Example #1
def words_total(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))
    totalwords = 0
    for file in filelists:
        totalwords += words_count(file)
    return totalwords
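These statistics functions rely on two project helpers that are not shown in this listing: File_operation.get_all_paths, which collects every file path under a directory, and words_count, which counts the words in a single file. A minimal sketch of what they might look like, assuming plain UTF-8 text files and the same tokenizer used in Example #4 (Wakati.words_list):

import os


def get_all_paths(top_dir):
    """Hypothetical stand-in for File_operation.get_all_paths:
    walk a directory tree and return every file path."""
    paths = []
    for root, _dirs, files in os.walk(top_dir):
        for name in files:
            paths.append(os.path.join(root, name))
    return paths


def words_count(path):
    """Hypothetical stand-in for words_count: read one UTF-8 text file
    and count its tokens."""
    with open(path, 'r', encoding='UTF-8', errors='ignore') as f:
        text = f.read()
    return len(Wakati.words_list(text))  # assumes the tokenizer from Example #4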
Example #2
def words_max(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))
    maxwords = 0
    name = ""
    for file in filelists:
        words = words_count(file)
        if words >= maxwords:
            maxwords = words
            name = file
    return maxwords, name
Example #3
def words_average(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))
    if len(filelists) == 0:
        averagewords = 0
    else:
        totalwords = 0
        for file in filelists:
            totalwords += words_count(file)
        averagewords = totalwords / len(filelists)
    return averagewords
Example #4
import codecs
from collections import Counter


def words_vocab(dir):
    filelists = File_operation.get_all_paths(dir)
    print("Number of files: " + str(len(filelists)))
    totalwords = []
    for file in filelists:
        with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
            sentence = file_in.read()
        totalwords.extend(Wakati.words_list(sentence))

    counter = Counter(totalwords)
    vocab_count = len(counter)
    return vocab_count
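Wakati.words_list is another project helper that is not shown; from its usage here it tokenizes Japanese text into a flat list of words. A minimal sketch under that assumption, using MeCab's wakati (word-splitting) output mode:

import MeCab

_tagger = MeCab.Tagger("-Owakati")  # space-separated tokenization


def words_list(text):
    """Hypothetical stand-in for Wakati.words_list: split Japanese text into tokens."""
    return _tagger.parse(text).split()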
Example #5
import math


def words_min(dir):
    filelists = File_operation.get_all_paths(dir)
    print("Number of files: " + str(len(filelists)))
    if len(filelists) == 0:
        minwords = 0
        name = None
    else:
        minwords = math.inf
        for file in filelists:
            words = words_count(file)
            if words <= minwords:
                minwords = words
                name = file
    return minwords, name
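Assuming a corpus directory, the five statistics above could be reported together like this (the directory path is a placeholder, as elsewhere in this listing):

corpus_dir = XXXXXXXXXX  # placeholder corpus directory

print("total words:  ", words_total(corpus_dir))
max_count, max_file = words_max(corpus_dir)
print("longest file: ", max_file, max_count)
min_count, min_file = words_min(corpus_dir)
print("shortest file:", min_file, min_count)
print("average words:", words_average(corpus_dir))
print("vocabulary:   ", words_vocab(corpus_dir))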
Example #6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import os
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation

INPUT_DIR = XXXXXXXXXX
filelist = File_operation.get_all_paths(INPUT_DIR)
OUTPUT_DIR = XXXXXXXXXX

with open(os.path.join(OUTPUT_DIR,"all_path(EncyclopediaMypedia58312).csv"),'w',encoding='UTF-8-sig')as file_out:
    for i,file in enumerate(tqdm(filelist)):
        name = file.replace(",","")
        file_out.write(str(i+1) + "," + name + "\n")
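The CSV produced above is just index,path pairs; if it is needed later it could be read back into a dictionary, for example (same file name as in the script above):

import csv

path_by_index = {}
with open(os.path.join(OUTPUT_DIR, "all_path(EncyclopediaMypedia58312).csv"),
          'r', encoding='UTF-8-sig') as f:
    for row in csv.reader(f):
        path_by_index[int(row[0])] = row[1]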
Example #7
import re
import os
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation

INPUT_DIR = XXXXXXXXXX
filelist = os.listdir(INPUT_DIR)
OUTPUT_DIR = rXXXXXXXXXX

with open(os.path.join(OUTPUT_DIR, "Livedoor_info.csv"),
          'w',
          encoding='UTF-8-sig') as file_out:  #書き込み用
    for list in tqdm(filelist):
        count = 0
        print("ジャンル:" + list)
        filelists = File_operation.get_all_paths(os.path.join(INPUT_DIR, list))
        file_out.write("URL,投稿日時,ジャンル,タイトル\n")
        for file in filelists:
            count += 1
            with open(file, 'r', encoding='UTF-8') as file_in:  #読み込み用
                lines = file_in.readlines()
                file_out.write(lines[0].replace("\n", "").replace(",", "") +
                               "," +
                               lines[1].replace("\n", "").replace(",", "") +
                               "," + list + "," +
                               lines[2].replace("\n", "").replace(",", "") +
                               "\n")
Example #8
import re
import os
from tqdm import tqdm
import sys
sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Delete
from Preprocessing import Sentence

INPUT_DIR = XXXXXXXXXX
dirlist = os.listdir(INPUT_DIR)
OUTPUT_DIR = XXXXXXXXXX

for dir in dirlist:
    print(dir)
    filelist = File_operation.get_all_paths(os.path.join(INPUT_DIR, dir))
    for i, file in enumerate(tqdm(filelist)):
        with open(file, 'r', encoding='UTF-8') as file_in:
            lines = file_in.readlines()
            title = lines[2]
            title = Delete.title(title)
            OUT = os.path.join(OUTPUT_DIR, dir)
            os.makedirs(OUT, exist_ok=True)
            with open(os.path.join(OUT, title + ".txt"), 'w',
                      encoding='UTF-8') as file_out:
                for line in lines[3:]:
                    if line != "\n":
                        sentencelists = Sentence.sentence_novel(line)
                        for sentence in sentencelists:
                            text = Delete.delete_wikipedia(sentence)
                            if text != "":
                                # the original snippet breaks off here; presumably each
                                # cleaned sentence is written to the output file
                                file_out.write(text + "\n")
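Sentence.sentence_novel and Delete.delete_wikipedia are project helpers that are not shown; from their usage, the former splits a line into individual sentences and the latter strips unwanted markup. A minimal sentence splitter under that assumption, breaking on the Japanese full stop:

import re


def sentence_novel(line):
    """Hypothetical stand-in for Sentence.sentence_novel:
    split one line into sentences, keeping the trailing Japanese full stop."""
    parts = re.split(r"(?<=。)", line.strip())
    return [p for p in parts if p]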
Example #9
import os
import json
import sys
from tqdm import tqdm

sys.path.append("..")
# Wakati and File_operation are assumed to live in the Preprocessing package,
# as in the other examples.
from Preprocessing import File_operation
from Preprocessing import Wakati

INPUT_DIR = XXXXXXXXXX
OUTPUT_DIR = XXXXXXXXXX
keyword = XXXXXXXXXX  # stem of the output file name (value redacted in the original)
"""ファイルから文書を取得"""


def read_document(path):
    with open(path, 'r', encoding='UTF-8', errors='ignore') as f:
        return f.read()


"""文書のtag(フルパス)と単語のリストをdictionaryで取得"""


def corpus_to_dictionary(corpus):
    dictionary = {}
    docs = [read_document(x) for x in corpus]
    for doc, name in tqdm(zip(docs, corpus), total=len(corpus)):
        words = Wakati.words_list(doc)
        dictionary[name] = words
    return dictionary


if __name__ == '__main__':
    print(keyword)
    corpus = File_operation.get_all_paths(INPUT_DIR)
    dictionary = corpus_to_dictionary(corpus)
    with open(os.path.join(OUTPUT_DIR, keyword + ".json"),
              'w',
              encoding='UTF-8') as file_out:
        json.dump(dictionary, file_out)
Example #10
        #     rank = [docid for docid, sim in sims].index(sentences[doc_id].tags[0])
        #     ranks.append(rank)
        # print(collections.Counter(ranks))

        model.save(
            os.path.join(OUTPUT_DIR,
                         OUTPUT_MODEL_NAME + "_" + str(x + 1) + ".model"))

    return model


if __name__ == '__main__':
    sentences = []
    if os.path.isdir(INPUT):
        print("Multi")
        filelists = File_operation.get_all_paths(INPUT)
        for i, file in enumerate(filelists):
            with open(file, 'r', encoding='UTF-8') as f:
                json_datas = json.load(f)
                sentence = corpus_to_sentences(json_datas)
                sentences.extend(sentence)
                print(len(sentence))
    else:
        print("One")
        with open(INPUT, 'r', encoding='UTF-8') as f:
            json_datas = json.load(f)
            sentence = corpus_to_sentences(json_datas)
            sentences.extend(sentence)
            print(len(sentence))
    print("ファイル数:" + str(len(sentences)))
    train(sentences)
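corpus_to_sentences and train are not shown in this excerpt. Judging from how the JSON dictionaries from Example #9 are consumed and from the surviving tail of the training function, the former turns each path-to-word-list entry into a gensim TaggedDocument and the latter trains a Doc2Vec model. A minimal sketch under those assumptions (hyperparameters are illustrative only):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def corpus_to_sentences(json_datas):
    """Assumed input shape: {file path: [word, word, ...]}, as dumped in Example #9."""
    return [TaggedDocument(words=words, tags=[name])
            for name, words in json_datas.items()]


def train(sentences, epochs=20):
    """Minimal Doc2Vec training loop mirroring the fragment above,
    which saves a checkpoint after each pass."""
    model = Doc2Vec(vector_size=300, window=8, min_count=5, workers=4)
    model.build_vocab(sentences)
    for _ in range(epochs):
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
        # a checkpoint could be saved here, as the original fragment does
    return model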