Example #1
import tensorflow as tf
import numpy as np
import tool as tool
import time

# data loading
data_path = './newscorpus.csv'
title, contents = tool.loading_data(data_path, eng=False, num=False, punc=False)
test_title, test_content = tool.loading_data("sample.csv", eng=False, num=False, punc=False)
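# the test titles are what the model will predict, so they are cleared to empty strings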
for i in range(len(test_title)):
    test_title[i] = ""
word_to_ix, ix_to_word = tool.make_dict_all_cut(title + contents + test_content, minlength=0, maxlength=3,
                                                jamo_delete=True)

# parameters
multi = True
forward_only = False
hidden_size = 300
vocab_size = len(ix_to_word)
num_layers = 3
learning_rate = 0.001
batch_size = 16
encoder_size = 100
decoder_size = tool.check_doclength(title, sep=True)  # (Maximum) number of time steps in this batch
steps_per_checkpoint = 20

# transform data
encoderinputs, decoderinputs, targets_, targetweights = \
    tool.make_inputs(contents, title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)
test_encoderinputs, test_decoderinputs, test_targets_, test_targetweights = \
    tool.make_inputs(test_content, test_title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)
Example #2
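# (this snippet begins mid-way through a loop that maps each agony category name to a numeric class label)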
            result.append([3])
        elif a == "직장생활":
            result.append([4])
        elif a == "진로":
            result.append([5])
        elif a == "친구":
            result.append([6])
        elif a == "이성":
            result.append([7])
        elif a == "이웃":
            result.append([8])
        elif a == "성격":
            result.append([9])

word_to_ix, ix_to_word = tool.make_dict_all_cut(contents,
                                                minlength=0,
                                                maxlength=3,
                                                jamo_delete=True)
encoder_size = 100
decoder_size = 3
encoderinputs, decoderinputs, targets_, targetweights = \
    tool.make_inputs(contents, title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)
# append the class label to each encoder input so input and label stay paired when shuffled
mix = []
for i in range(len(encoderinputs)):
    mix.append(encoderinputs[i] + result[i])


def train_test_split(records, testratio=0.2):
    np.random.seed(int(len(records) / 2))
    np.random.shuffle(records)
    # assumed completion: hold out the last `testratio` fraction of the shuffled records as the test set
    split = int(len(records) * (1 - testratio))
    return records[:split], records[split:]
Example #3
data_path = './sample.csv'
title, contents = tool.loading_data(data_path,
                                    eng=False,
                                    num=False,
                                    punc=False)  # read the training data
# test_title, test_content = tool.loading_data("new_simple.csv", eng=False, num=False, punc=False)  # read the test data
test_content = tool.loading_test_data("test.csv",
                                      eng=False,
                                      num=False,
                                      punc=False)  # read the test data
test_title = []
for i in range(len(test_content)):  # test_title is what we predict, so fill it with empty strings
    test_title.append("")
word_to_ix, ix_to_word = tool.make_dict_all_cut(
    title + contents + test_content,
    minlength=0,
    maxlength=3,
    jamo_delete=True)  # index the words (builds word-to-index and index-to-word dictionaries)
input_title_content = title + contents + test_content
# arbitrary sentences whose word vectors we will analyze
# join all the sentences, split them on whitespace, and build a list of the unique words

# konlpy's Kkma object (the import is added here because the top of this snippet is cut off)
from konlpy.tag import Kkma

print("Extracting nouns with konlpy (Kkma)")
kkma = Kkma()
# adding a noun-only vocabulary and running word2vec on it improves the word embeddings

a = []
for i in input_title_content:
    for word in kkma.nouns(i):
        a.append(word)  # add the noun to the word2vec vocabulary
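# The snippet ends before the word2vec step the comments above describe; the lines
# below are a minimal sketch of that step using gensim. The gensim dependency and
# the hyperparameter values are assumptions, not part of the original code.
from gensim.models import Word2Vec

# one noun-token list per document, so word2vec trains on the noun-only vocabulary
noun_sentences = [kkma.nouns(doc) for doc in input_title_content]
# gensim >= 4.0 names the dimensionality `vector_size` (older releases call it `size`)
w2v_model = Word2Vec(noun_sentences, vector_size=300, window=5, min_count=1)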
Example #4
    agony = ["외모", "가족", "학업", "취업", "직장생활", "진로", "친구", "이성", "이웃", "성격"]
    contents = []
    title = []
    count = 0
    result = []
    for i in range(len(agony)):
        # each category has its own text file with one document per line
        with open(agony[i] + "_v7.txt", 'r') as f:
            lines = f.readlines()
        for line in lines:
            contents.append(line.split("\n")[0])
        for line in lines:
            title.append(agony[i])
            result.append([i])  # the class label is the category's index in `agony`

    word_to_ix, ix_to_word = tool.make_dict_all_cut(contents,
                                                    minlength=0,
                                                    maxlength=3,
                                                    jamo_delete=True)
    encoder_size = 100
    decoder_size = 3
    encoderinputs, decoderinputs, targets_, targetweights = \
        tool.make_inputs(contents, title, word_to_ix,
                         encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)

    # append the class label to each encoder input so input and label shuffle together
    mix = []
    for i in range(len(encoderinputs)):
        mix.append(encoderinputs[i] + result[i])

    train, test = train_test_split(mix)
    train_input = []
    train_output = []
    test_input = []