Example #1
import codecs

from Preprocessing import Wakati  # project-local tokenizer (see Example #10 for the import path)


def words_count(file):
    """Count the total number of words (tokens) in a file."""
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
        words = Wakati.words_list(sentence)  # all words
        # words = Wakati.words_list_select(sentence)  # selected parts of speech only
        word_count = len(words)
    return word_count
Example #2
from tqdm import tqdm

from Preprocessing import Wakati  # project-local tokenizer


def corpus_to_dictionary(corpus):
    """Map each document path in the corpus to its list of words."""
    dictionary = {}
    docs = [read_document(x) for x in corpus]  # read_document: see the sketch below
    for doc, name in tqdm(zip(docs, corpus), total=len(corpus)):
        words = Wakati.words_list(doc)
        dictionary[name] = words
    return dictionary
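
read_document is not defined in this snippet; below is a minimal sketch consistent with how the other examples read files (the error handling here is an assumption):

import codecs

def read_document(path):
    # Read a UTF-8 file, ignoring undecodable bytes, as the other examples do.
    with codecs.open(path, 'r', 'UTF-8', 'ignore') as file_in:
        return file_in.read()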
Example #3
import codecs
from collections import Counter

from Preprocessing import Wakati  # project-local tokenizer


def vocabs_count(file):
    """Count the number of distinct words (vocabulary size) in a file."""
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
        words = Wakati.words_list(sentence)  # all words
        # words = Wakati.words_list_select(sentence)  # selected parts of speech only
        counter = Counter(words)
        vocab_count = len(counter)
    return vocab_count
Example #4
from Preprocessing import Wakati  # project-local tokenizer


def doc2vec_sim_unknow(model, Target, topn):
    """Infer a vector for an unseen document and return its most similar documents."""
    print("Similarity for " + Target + " (unseen document)")
    with open(Target, 'r', encoding='UTF-8') as file_in:
        text = file_in.read()
        words_list = Wakati.words_list(text)
        vector = model.infer_vector(words_list)

    # gensim < 4.0 API; in gensim >= 4.0 use model.dv instead of model.docvecs.
    sims = model.docvecs.most_similar([vector], topn=topn)
    return sims
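
A minimal usage sketch; the model and query file paths here are hypothetical:

from gensim import models

model = models.Doc2Vec.load("doc2vec.model")  # hypothetical model path
for name, score in doc2vec_sim_unknow(model, "query.txt", topn=10):
    print(name, score)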
Example #5
import codecs
from collections import Counter

from Preprocessing import File_operation, Wakati  # project-local modules


def words_vocab(dir):
    """Count the vocabulary size across all files under a directory."""
    filelists = File_operation.get_all_paths(dir)
    print("Number of files: " + str(len(filelists)))
    totalwords = []
    for file in filelists:
        with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
            sentence = file_in.read()
        totalwords.extend(Wakati.words_list(sentence))

    counter = Counter(totalwords)
    vocab_count = len(counter)
    return vocab_count
Example #6
from Preprocessing import Wakati  # project-local tokenizer


def doc2vec_cal_unknown(model, pos1, neg1, pos2, topn):
    """Vector arithmetic (pos1 - neg1 + pos2) over three unseen documents."""
    pos1_word = pos1.split("\\")[-1]  # file name from a Windows-style path
    neg1_word = neg1.split("\\")[-1]
    pos2_word = pos2.split("\\")[-1]
    print(pos1_word + " - " + neg1_word + " + " + pos2_word + " (unseen documents)")

    def infer(path):
        # Tokenize one file and infer its document vector.
        with open(path, 'r', encoding='UTF-8') as file_in:
            return model.infer_vector(Wakati.words_list(file_in.read()))

    pos1_vec = infer(pos1)
    neg1_vec = infer(neg1)
    pos2_vec = infer(pos2)

    # gensim < 4.0 API; in gensim >= 4.0 use model.dv instead of model.docvecs.
    sims = model.docvecs.most_similar(positive=[pos1_vec, pos2_vec],
                                      negative=[neg1_vec],
                                      topn=topn)
    return sims
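
A minimal usage sketch of the vector arithmetic; the model and document paths are hypothetical:

from gensim import models

model = models.Doc2Vec.load("doc2vec.model")  # hypothetical model path
# "a.txt" - "b.txt" + "c.txt", by analogy with word-vector arithmetic
sims = doc2vec_cal_unknown(model, "a.txt", "b.txt", "c.txt", topn=5)
print(sims)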
Example #7
import codecs
from collections import Counter

from Preprocessing import Wakati  # project-local tokenizer


def all_count(file):
    """Return (word count, vocabulary size, line count) for a file."""
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
    sentences = sentence.splitlines()  # one entry per line
    words = Wakati.words_list(sentence)  # all words
    # words = Wakati.words_list_select(sentence)  # selected parts of speech only

    word_count = len(words)
    counter = Counter(words)
    vocab_count = len(counter)
    sentence_count = len(sentences)
    return word_count, vocab_count, sentence_count
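
A minimal usage sketch; the file path is hypothetical:

word_count, vocab_count, sentence_count = all_count("corpus.txt")
print("words:", word_count, "vocab:", vocab_count, "sentences:", sentence_count)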
Example #8
import os

from tqdm import tqdm

from Preprocessing import Wakati
from Preprocessing.File_operation import get_all_paths  # assumed import path for the project-local helper


def concate_wakati(INPUT_DIR, OUTPUT_DIR, name):
    """Tokenize every file under INPUT_DIR and concatenate the results into one output file."""
    lists = get_all_paths(INPUT_DIR)
    with open(os.path.join(OUTPUT_DIR, name), 'w',
              encoding='UTF-8') as file_out:
        print("Total number of files: " + str(len(lists)))
        for file in tqdm(lists):
            with open(file, 'r', encoding='UTF-8-sig') as file_in:
                lines = file_in.readlines()
            for i, line in enumerate(lines):
                text = Wakati.wakati(line)
                # Drop the trailing newline only on the very last line of the last file.
                if i == len(lines) - 1 and file == lists[-1]:
                    text = text.replace("\n", "")
                file_out.write(text)
Example #9
# Script fragment: tweets_paths, f_txt, f_pre, filenumber, and the save_dir_*
# variables are defined earlier in the script.
for i, file in enumerate(tweets_paths):
    try:
        with codecs.open(file, 'r', 'utf8') as fi:
            tweet_datas = json.load(fi)
        print(str(i + 1) + "×" + str(len(tweet_datas)) + " Tweets")
    except Exception:
        continue  # skip files that cannot be read or parsed

    for tweet_data in tweet_datas:
        text = tweet_data["text"].replace("\r", "").replace("\n", "")
        f_txt.write(text + "\n")
        text = text.replace(",", "")
        """String deletion & word segmentation"""
        text = Delete.delete_twitter(text)  # strip Twitter-specific strings

        text = Wakati.wakati(text)  # wakati (word segmentation)
        f_pre.write(text)

        with open(os.path.join(
                save_dir_corpus_koko,
                save_dir_name + "_pre_" + str(filenumber) + ".txt"),
                  'w',
                  encoding='UTF-8') as file_koko:  # preprocessed output, one file per tweet
            file_koko.write(text)

        filenumber += 1
f_txt.close()
f_pre.close()
Example #10
import codecs
import os
import pickle
import sys

from gensim import models
from tqdm import tqdm

sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Wakati

"""Load the model"""
INPUT_MODEL = XXXXXXXXXX
model = models.Doc2Vec.load(INPUT_MODEL)

"""Input and output locations"""
keyword = XXXXXXXXXX
INPUT_DIR = XXXXXXXXXX
OUTPUT_DIR = XXXXXXXXXX
filelists = File_operation.get_all_paths(INPUT_DIR)

"""Vectorize"""
dictionary = {}
for file in tqdm(filelists):
    title = file.split("\\")[-1]  # file name from a Windows-style path

    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
        words = Wakati.words_list(sentence)
        vector = model.infer_vector(words)

    dictionary[title] = vector

"""Save to a pickle file"""
with open(os.path.join(OUTPUT_DIR, keyword + ".pkl"), mode='wb') as f:
    pickle.dump(dictionary, f)
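
A minimal sketch of loading the saved vectors back from the pickle file, using the same placeholder names as above:

with open(os.path.join(OUTPUT_DIR, keyword + ".pkl"), mode='rb') as f:
    vectors = pickle.load(f)
print(str(len(vectors)) + " documents vectorized")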