import time

import numpy as np
import tensorflow as tf

import tool

# data loading
data_path = './newscorpus.csv'
title, contents = tool.loading_data(data_path, eng=False, num=False, punc=False)
test_title, test_content = tool.loading_data("sample.csv", eng=False, num=False, punc=False)
# test titles are what we predict, so blank them out
for i in range(len(test_title)):
    test_title[i] = ""
word_to_ix, ix_to_word = tool.make_dict_all_cut(title + contents + test_content,
                                                minlength=0, maxlength=3, jamo_delete=True)

# parameters
multi = True
forward_only = False
hidden_size = 300
vocab_size = len(ix_to_word)
num_layers = 3
learning_rate = 0.001
batch_size = 16
encoder_size = 100
decoder_size = tool.check_doclength(title, sep=True)  # (maximum) number of time steps in this batch
steps_per_checkpoint = 20

# transform data
encoderinputs, decoderinputs, targets_, targetweights = \
    tool.make_inputs(contents, title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)
test_encoderinputs, test_decoderinputs, test_targets_, test_targetweights = \
    tool.make_inputs(test_content, test_title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)
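# For reference, a minimal sketch of the contract that tool.make_inputs is
# assumed to satisfy for a seq2seq model: padded encoder/decoder index
# sequences, shifted targets, and target weights that mask padding. The
# function name, PAD/GO/EOS indices, and tokenization here are assumptions
# for illustration, not the actual implementation in tool.py.
def make_inputs_sketch(sources, targets, word_to_ix, encoder_size, decoder_size,
                       pad_ix=0, go_ix=1, eos_ix=2):
    enc, dec, tgt, wgt = [], [], [], []
    for src, trg in zip(sources, targets):
        s = [word_to_ix.get(w, pad_ix) for w in src.split()][:encoder_size]
        t = [word_to_ix.get(w, pad_ix) for w in trg.split()][:decoder_size - 1]
        # encoder input: token indices padded to encoder_size
        enc.append(s + [pad_ix] * (encoder_size - len(s)))
        # decoder input: GO token followed by the target tokens
        d = [go_ix] + t
        dec.append(d + [pad_ix] * (decoder_size - len(d)))
        # target: the target tokens followed by EOS
        y = t + [eos_ix]
        tgt.append(y + [pad_ix] * (decoder_size - len(y)))
        # weight 1.0 for real target tokens, 0.0 for padding
        wgt.append([1.0] * len(y) + [0.0] * (decoder_size - len(y)))
    return enc, dec, tgt, wgt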
# map each agony category label to a class index
# (head of the chain reconstructed from the agony label list used below)
result = []
for a in title:
    if a == "외모":
        result.append([0])
    elif a == "가족":
        result.append([1])
    elif a == "학업":
        result.append([2])
    elif a == "취업":
        result.append([3])
    elif a == "직장생활":
        result.append([4])
    elif a == "진로":
        result.append([5])
    elif a == "친구":
        result.append([6])
    elif a == "이성":
        result.append([7])
    elif a == "이웃":
        result.append([8])
    elif a == "성격":
        result.append([9])

word_to_ix, ix_to_word = tool.make_dict_all_cut(contents, minlength=0, maxlength=3, jamo_delete=True)
encoder_size = 100
decoder_size = 3
encoderinputs, decoderinputs, targets_, targetweights = \
    tool.make_inputs(contents, title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)

# append the class label to each encoder input
mix = []
for i in range(len(encoderinputs)):
    mix.append(encoderinputs[i] + result[i])

def train_test_split(records, testratio=0.2):
    # deterministic shuffle, then split off the last `testratio` fraction as the test set
    np.random.seed(int(len(records) / 2))
    np.random.shuffle(records)
    splitpoint = int(len(records) * (1 - testratio))
    return records[:splitpoint], records[splitpoint:]
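# The if/elif chain above can be written more compactly as a dictionary
# lookup; an equivalent sketch using the same labels and indices:
label_to_ix = {"외모": 0, "가족": 1, "학업": 2, "취업": 3, "직장생활": 4,
               "진로": 5, "친구": 6, "이성": 7, "이웃": 8, "성격": 9}
# result = [[label_to_ix[a]] for a in title]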
from konlpy.tag import Kkma  # missing import: Kkma comes from KoNLPy

data_path = './sample.csv'
title, contents = tool.loading_data(data_path, eng=False, num=False, punc=False)  # read training data
# test_title, test_content = tool.loading_data("new_simple.csv", eng=False, num=False, punc=False)  # read test data
test_content = tool.loading_test_data("test.csv", eng=False, num=False, punc=False)  # read test data
test_title = []
for i in range(len(test_content)):
    # test_title is what we are predicting, so fill it with empty values
    test_title.append("")

# index the words (a word-to-index dictionary and an index-to-word dictionary)
word_to_ix, ix_to_word = tool.make_dict_all_cut(
    title + contents + test_content, minlength=0, maxlength=3, jamo_delete=True)

# sentences whose word vectors we will analyze:
# after joining them all, split the words on whitespace and build a list of unique words
input_title_content = title + contents + test_content

# create the KoNLPy analyzer object
print("Extracting nouns with KoNLPy...")
kkma = Kkma()

# adding a noun-only vocabulary before running word2vec
# improves the quality of the word embeddings
a = []
for i in input_title_content:
    for word in kkma.nouns(i):
        a.append(word)
# add these nouns to the word2vec vocabulary
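# A minimal sketch of the word2vec step the comments above refer to, assuming
# gensim 4.x is available; the helper name and all hyperparameters here are
# illustrative choices, not taken from the original project.
from gensim.models import Word2Vec

def train_word2vec_sketch(corpus_sentences, extra_nouns):
    # whitespace-tokenized sentences, plus the extracted nouns as one extra "sentence"
    sentences = [line.split() for line in corpus_sentences] + [extra_nouns]
    model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)
    return model

# e.g. w2v_model = train_word2vec_sketch(input_title_content, a)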
agony = ["외모", "가족", "학업", "취업", "직장생활", "진로", "친구", "이성", "이웃", "성격"]
contents = []
title = []
result = []
for i in range(len(agony)):
    # one text file of posts per agony category
    with open(agony[i] + "_v7.txt", 'r') as f:
        lines = f.readlines()
    for line in lines:
        contents.append(line.split("\n")[0])
    for line in lines:
        title.append(agony[i])
        result.append([i])

word_to_ix, ix_to_word = tool.make_dict_all_cut(contents, minlength=0, maxlength=3, jamo_delete=True)
encoder_size = 100
decoder_size = 3
encoderinputs, decoderinputs, targets_, targetweights = \
    tool.make_inputs(contents, title, word_to_ix,
                     encoder_size=encoder_size, decoder_size=decoder_size, shuffle=False)

# append the class label to each encoder input, then split into train/test
mix = []
for i in range(len(encoderinputs)):
    mix.append(encoderinputs[i] + result[i])
train, test = train_test_split(mix)

train_input = []
train_output = []
test_input = []
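# A sketch of how the mixed rows are presumably unpacked into the lists above,
# assuming each row of `mix` is the encoder input with its class label
# appended as the last element (hypothetical helper; the original unpacking
# code is not shown in this section).
def unpack_rows(rows):
    inputs = [row[:-1] for row in rows]   # encoder token indices
    outputs = [row[-1] for row in rows]   # class label
    return inputs, outputs

# e.g. train_input, train_output = unpack_rows(train)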