# Exemplo n.º 1
# 0
# NER training-data preparation: inspect the corpus, build a BIO-tag
# vocabulary, encode words/tags as integer sequences, pad to a fixed
# length, and split into train/test sets.
# Relies on globals defined earlier (not visible in this chunk):
# `sentences` (tokenized samples), `tags` (BIO tag sequences),
# `p` (project Preprocess instance), Keras `preprocessing`, and
# sklearn's `train_test_split`.
print('샘플 크기: \n', len(sentences))
print('0번째 샘플 단어 시퀀스: \n', sentences[0])
print('0번째 샘플 bio 태그: \n', tags[0])
print('샘플 단어 시퀀스 최대 길이: ', max(len(l) for l in sentences))
print('샘플 단어 시퀀스 평균 길이: ', (sum(map(len, sentences)) / len(sentences)))

# Fit a tokenizer on the BIO tags; lower=False preserves tag casing.
tag_tokenizer = preprocessing.text.Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)

# +1 reserves index 0 (used as the padding index) in both vocabularies.
vocab_size = len(p.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1
print('BIO 태그 사전 크기: ', tag_size)
print('단어 사전 크기: ', vocab_size)

# Encode words via the project word index and tags via the fitted tokenizer.
x_train = [p.get_wordidx_sequence(sent) for sent in sentences]
y_train = tag_tokenizer.texts_to_sequences(tags)

# Reverse map for decoding model predictions; index 0 decodes to 'PAD'.
index_to_ner = tag_tokenizer.index_word
index_to_ner[0] = 'PAD'

# Post-pad / truncate every sequence to a fixed length of 40 tokens.
max_len = 40
x_train = preprocessing.sequence.pad_sequences(x_train,
                                               padding='post',
                                               maxlen=max_len)
y_train = preprocessing.sequence.pad_sequences(y_train,
                                               padding='post',
                                               maxlen=max_len)

# NOTE(review): this call is truncated in the source — its remaining
# arguments and closing parenthesis lie outside this chunk.
x_train, x_test, y_train, y_test = train_test_split(x_train,
                                                    y_train,
# Exemplo n.º 2
# 0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

# Intent-classification data pipeline: load (query, intent) pairs from CSV,
# convert each query to a padded word-index sequence, and build shuffled
# tf.data train/val/test splits (70% / 20% / 10%).
# Relies on globals not visible in this chunk: `pd`, `tf`, `preprocessing`.
train_file = 'total_train_data.csv'
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin')

# Tokenize each query, strip POS tags, and map the keywords to word indices.
sequences = [
    p.get_wordidx_sequence(p.get_keywords(p.pos(sentence), without_tag=True))
    for sentence in queries
]

from config.globalparams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents))
# BUG FIX: Dataset.shuffle reshuffles on every iteration by default, so the
# take/skip splits below would draw *different* samples each epoch, leaking
# training examples into the val/test sets. Freezing the shuffle order keeps
# the three splits disjoint and stable across epochs.
ds = ds.shuffle(len(queries), reshuffle_each_iteration=False)

# 70/20/10 split sizes (int() truncation may leave a few trailing samples
# unused — same as the original behavior).
train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

# Disjoint, batched views over the (now fixed-order) shuffled stream.
train_ds = ds.take(train_size).batch(32)
val_ds = ds.skip(train_size).take(val_size).batch(32)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(32)
# Exemplo n.º 3
# 0
import sys

sys.path.append(".")
import pickle
from utils.preprocess import Preprocess

# Sanity-check script: load the pickled word index directly and compare the
# raw dictionary lookups against the Preprocess wrapper's sequence output.
# SECURITY NOTE: pickle.load executes arbitrary code from the file — this
# dictionary must come from a trusted build step, never untrusted input.
# FIX: use a context manager so the file handle is released even if
# pickle.load raises (the original open/close pair could leak the handle).
with open("./train/chatbot_bin.bin", "rb") as f:
    word_index = pickle.load(f)

sent = "갑자기 짜장면 먹고 싶네 ㅋㅋ"

p = Preprocess("./train/chatbot_bin.bin")
pos = p.pos(sent)
keywords = p.get_keywords(pos, without_tag=True)

print(p.word_index)
print(p.get_wordidx_sequence(keywords))
# Print each keyword's index, falling back to the out-of-vocabulary entry
# only when the word is absent (dict.get replaces the try/except KeyError).
for word in keywords:
    idx = word_index.get(word)
    if idx is None:
        idx = word_index["OOV"]
    print(word, idx)