# --- NER training data preparation -------------------------------------------
# NOTE(review): this chunk was collapsed onto one source line and its final
# statement was truncated; reformatted and completed below.
# Assumes `sentences` (list of token lists), `tags` (list of BIO-tag lists) and
# `p` (a Preprocess instance exposing word_index / get_wordidx_sequence) are
# defined earlier in the file — TODO confirm.

# Corpus statistics for a quick sanity check.
print('샘플 크기: \n', len(sentences))
print('0번째 샘플 단어 시퀀스: \n', sentences[0])
print('0번째 샘플 bio 태그: \n', tags[0])
print('샘플 단어 시퀀스 최대 길이: ', max(len(l) for l in sentences))
print('샘플 단어 시퀀스 평균 길이: ', (sum(map(len, sentences)) / len(sentences)))

# Build the BIO-tag vocabulary. lower=False keeps tag case intact
# (e.g. 'B_DT' must not become 'b_dt').
tag_tokenizer = preprocessing.text.Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)

# +1 reserves index 0, which Keras tokenizers never assign (used for padding).
vocab_size = len(p.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1
print('BIO 태그 사전 크기: ', tag_size)
print('단어 사전 크기: ', vocab_size)

# Encode words and tags as integer index sequences.
x_train = [p.get_wordidx_sequence(sent) for sent in sentences]
y_train = tag_tokenizer.texts_to_sequences(tags)

# index -> tag lookup for decoding predictions; map padding index 0 to 'PAD'.
index_to_ner = tag_tokenizer.index_word
index_to_ner[0] = 'PAD'

# Pad (or truncate) every sequence to a fixed length for batched training.
max_len = 40
x_train = preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding='post', maxlen=max_len)

# 80/20 train/test split.
# NOTE(review): the original call was cut off after `y_train,`; the keyword
# arguments below are a reasonable completion — confirm against the original.
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                    test_size=.2,
                                                    random_state=1234)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

# --- Intent classifier training data pipeline --------------------------------
# Load (query, intent) pairs from the training CSV.
train_file = 'total_train_data.csv'
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin')

# Convert each query into a sequence of word indices (POS tags stripped).
sequences = []
for sentence in queries:
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

from config.globalparams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents))
# Shuffle ONCE, then carve out the splits.
# FIX: tf.data reshuffles on every iteration by default, so take()/skip()
# over a reshuffling dataset would draw different (overlapping) samples each
# epoch, leaking validation/test examples into training. Pinning the shuffle
# with reshuffle_each_iteration=False keeps the three splits disjoint.
ds = ds.shuffle(len(queries), reshuffle_each_iteration=False)

# 70% train / 20% validation / 10% test.
train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

train_ds = ds.take(train_size).batch(32)
val_ds = ds.skip(train_size).take(val_size).batch(32)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(32)
import sys
sys.path.append(".")

import pickle

from utils.preprocess import Preprocess

# --- Smoke test for the pickled word -> index dictionary ----------------------
# FIX: load the pickle via a context manager so the file handle is closed
# even if unpickling raises (original used open()/close() without try/finally).
with open("./train/chatbot_bin.bin", "rb") as f:
    word_index = pickle.load(f)

sent = "갑자기 짜장면 먹고 싶네 ㅋㅋ"

# Preprocess loads the same dictionary internally; compare its output with
# the raw dict lookups below.
p = Preprocess("./train/chatbot_bin.bin")

pos = p.pos(sent)
keywords = p.get_keywords(pos, without_tag=True)
print(p.word_index)
print(p.get_wordidx_sequence(keywords))

# Print each keyword's index; unknown words fall back to the OOV index.
# EAFP: only consult the 'OOV' entry when the word is actually missing.
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        print(word, word_index["OOV"])