# Assumed imports for this snippet (not shown in the original excerpt).
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer


class DLModel(BenchmarkedModel):
    def __init__(self):
        super().__init__()
        max_features = 1024
        # Binary classifier: embedding -> LSTM -> dropout -> sigmoid output.
        model = Sequential()
        model.add(Embedding(max_features, output_dim=256))
        model.add(LSTM(128))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
        self.clf = model
        self.vectorizer = Tokenizer()

    def fit(self, data, labels):
        # Fit the vocabulary on the training texts, then vectorize them as count matrices.
        self.vectorizer.fit_on_texts(data)
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")
        self.clf.fit(processed_data, labels, batch_size=16, epochs=10)

    def predict(self, data):
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")
        return self.clf.predict(processed_data)

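For orientation, a minimal usage sketch of the class above. It assumes BenchmarkedModel takes no constructor arguments; the texts and labels are placeholders, not data from the original.

import numpy as np

# Placeholder data purely for illustration.
texts = ["spam spam buy now", "see you at lunch tomorrow"]
labels = np.array([1, 0])

model = DLModel()
model.fit(texts, labels)
print(model.predict(["buy now", "lunch tomorrow"]))
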
# Assumed imports for this snippet (not shown in the original excerpt).
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


def model(classes, num_classes, epochs=10, verbose=1):
    # Each element of `classes` is (train_docs, train_labels, test_docs, test_labels).
    tokenizer = Tokenizer()
    train_documents, train_labels = [], []
    test_documents, test_labels = [], []
    for c in classes:
        train_documents += c[0]
        train_labels += c[1]
        test_documents += c[2]
        test_labels += c[3]
    train_labels = to_categorical(train_labels, num_classes)
    test_labels = to_categorical(test_labels, num_classes)

    # Fit the vocabulary on the training documents only, then vectorize with TF-IDF weights.
    tokenizer.fit_on_texts(train_documents)
    train = tokenizer.texts_to_matrix(train_documents, mode="tfidf")
    test = tokenizer.texts_to_matrix(test_documents, mode="tfidf")
    n_words = train.shape[1]

    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(train, train_labels, epochs=epochs, verbose=verbose)
    loss, acc = model.evaluate(test, test_labels, verbose=verbose)
    print(str(acc * 100))

def one_hot_word_with_keras():
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(samples)
    sequences = tokenizer.texts_to_sequences(samples)
    one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

    word_index = tokenizer.word_index
    print("Found {} unique tokens.".format(len(word_index)))

def build_matrix(self):
    """
    Transform the data frame to a matrix for the CNN training.
    :return: matrices for: x_train, x_test, y_train, y_test
    """
    self.lb_make = LabelEncoder()
    self.lb_make.fit(self.Y_train)
    tokenizer = Tokenizer(num_words=2000)
    x_array_train = numpy.asarray(self.train['text'])
    x_array_test = numpy.asarray(self.test['text'])

    # Fit the vocabulary on the training texts only, then vectorize both splits as count matrices.
    tokenizer.fit_on_texts(x_array_train)
    x_train_matrix = tokenizer.texts_to_matrix(x_array_train, mode='count')
    x_test_matrix = tokenizer.texts_to_matrix(x_array_test, mode='count')

    # Encode the string labels as integers, then one-hot encode them for the 3 classes.
    y_train_numbers = self.lb_make.transform(self.Y_train)
    y_test_numbers = self.lb_make.transform(self.Y_test)
    y_train_matrix = keras.utils.to_categorical(y_train_numbers, 3)
    y_test_matrix = keras.utils.to_categorical(y_test_numbers, 3)

    self.tokenizer = tokenizer
    return x_train_matrix, x_test_matrix, y_train_matrix, y_test_matrix

def get_data_as_one_hot(num_words, data_location='data/data', labels_location='data/labels'):
    data, labels = read_data_and_labels(data_location, labels_location)

    # Binary bag-of-words encoding: one row per document, one column per vocabulary word.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    one_hot = tokenizer.texts_to_matrix(data, mode='binary')
    encoded_labels = np.asarray(labels).astype('float32')

    print('Returning encoded text, labels and tokenizer')
    return one_hot, encoded_labels, tokenizer

class ArticleThemeTokenizer:
    """
    List of themes in the same order as in the tokenizer, which also corresponds
    to the index of each theme in the prediction vector.
    """
    orderedThemes: List[str]
    themes_count: int
    tokenizer: Tokenizer

    def __init__(self, articles: Articles):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.themes())
        self.one_hot_matrix = self.tokenizer.texts_to_matrix(articles.themes())

        # Remove the first column, which contains only 0s (index 0 is reserved by the tokenizer).
        self.one_hot_matrix = np.delete(arr=self.one_hot_matrix, obj=0, axis=1)

        # Create an ordered list of themes, in the same order as in the tokenizer.
        self.orderedThemes: List[str] = []
        for i in range(1, len(self.tokenizer.word_index) + 1):  # word_index starts at 1; 0 is reserved.
            self.orderedThemes.append(self.tokenizer.index_word[i])

        self.themes_count = len(self.tokenizer.word_index)

    def index_of_theme(self, theme: str):
        return self.tokenizer.word_index[theme] - 1

    def theme_at_index(self, index: int):
        return self.tokenizer.index_word[index + 1]

    def boolean_vector_to_themes(self, prediction_vector: List[bool]) -> List[str]:
        themes: List[str] = []
        for idx in range(0, len(prediction_vector)):
            if prediction_vector[idx]:
                # +1 because the first index (0) is reserved by default.
                themes.append(self.tokenizer.index_word[idx + 1])
        return themes

    def save(self, path: str):
        tokenizer_json = self.tokenizer.to_json()
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)

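A rough usage sketch for the class above; `articles` is a stand-in for a real Articles instance whose themes() returns one space-separated theme string per article, and the file name is illustrative.

theme_tokenizer = ArticleThemeTokenizer(articles)

# Turn a boolean prediction vector back into theme names (here only the first theme is "on").
prediction = [i == 0 for i in range(theme_tokenizer.themes_count)]
print(theme_tokenizer.boolean_vector_to_themes(prediction))

# index_of_theme and theme_at_index are inverses of each other.
first_theme = theme_tokenizer.orderedThemes[0]
assert theme_tokenizer.theme_at_index(theme_tokenizer.index_of_theme(first_theme)) == first_theme

# Persist the fitted tokenizer as JSON for later reuse.
theme_tokenizer.save("theme_tokenizer.json")
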
# %%
from keras_preprocessing.text import Tokenizer

samples = [
    "the cat sat on the mat",
    "the dog ate my homework",
    "the the ate ate dog dog",
    "가 나 다",
]

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode="binary")

word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))
word_index

# %%
sequences

# %%
one_hot_results.shape

# %%
one_hot_results[:10, :10]

# %%
# Korean samples: "The cat sat on the mat", "The dog ate my homework".
samples = ["그 고양이는 매트 위에 앉았다", "그 개는 숙제를 먹었다"]
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)

print(token_index)
print()

# Tokenize with Keras' Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(samples)

token_seq = tokenizer.texts_to_sequences(samples)  # integer-index the texts
print(token_seq)
print()

token_mat = tokenizer.texts_to_matrix(samples, mode='binary')  # mode can be 'binary', 'count', or 'tfidf'
print(token_mat)

word_index = tokenizer.word_index
print(word_index)
print('found %s unique tokens' % (len(word_index)))  # found 9 unique tokens
print(tokenizer.word_counts)
print(tokenizer.document_count)
print(tokenizer.word_docs)
print()

# Korean sample documents, roughly: "First, split each word of the text and tokenize it.",
# "The text must be tokenized into words to be recognized by deep learning.",
# "The tokenized result can be used in deep learning."
docs = [
    '먼저 텍스트의 각 단어를 나누어 토큰화 한다.',
    '텍스트의 단어로 토큰화 해야 딥러닝에서 인식된다.',
    '토큰화 한 결과는 딥러닝에서 사용할 수 있다'
]

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
sentences = newsgroups_train.data
y = newsgroups_train.target

tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)
max_len = max([len(s.split()) for s in sentences])
vocab_len = len(tokenizer.word_index) + 1

# Vectorize the documents, then pad/truncate every row to max_len columns.
sentences = tokenizer.texts_to_matrix(sentences)
padded_docs = pad_sequences(sentences, maxlen=max_len)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size=0.25, random_state=0)

model = Sequential()
model.add(layers.Dense(300, input_dim=X_train.shape[1], activation='relu'))  # input width must match the padded matrix
model.add(layers.Dense(20, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

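The excerpt stops after compile; a possible continuation (epoch count and batch size are illustrative, not from the original) would train on the split created above and report test accuracy.

history = model.fit(X_train, y_train, epochs=5, batch_size=64,
                    validation_data=(X_test, y_test), verbose=1)
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('test accuracy: %.3f' % acc)
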
Character = data['Character'].values.tolist()
Dialogue = data['Dialogue'].values.tolist()

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(Dialogue)

# Group dialogue lines by character.
character_dialog = {}
for i in range(len(Dialogue)):
    if Character[i] in character_dialog:
        character_dialog[Character[i]].append(Dialogue[i])
    else:
        character_dialog[Character[i]] = []
        character_dialog[Character[i]].append(Dialogue[i])

# For each character, keep a fixed-size random sample of lines and vectorize them.
dataX, dataY = [], []
for key, value in character_dialog.items():
    random.shuffle(value)
    value = value[:6664]
    value = tokenizer.texts_to_matrix(value)
    dataX.extend(value)
    for i in range(len(value)):
        dataY.append(key)

# Shuffle features and labels together, then binarize the labels.
c = list(zip(dataX, dataY))
random.shuffle(c)
dataX, dataY = zip(*c)
le = LabelBinarizer()
dataY = le.fit_transform(dataY)
dataX = numpy.array(dataX)
dataY = numpy.array(dataY)

model = Sequential()
model.add(Dense(64, input_shape=(len(dataX[0]),), activation='relu'))
model.add(Dense(32, activation='relu'))

Character = data['Character'].values.tolist()
Dialogue = data['Dialogue'].values.tolist()

# Group dialogue lines by character.
character_dialog = {}
for i in range(len(Dialogue)):
    if Character[i] in character_dialog:
        character_dialog[Character[i]].append(Dialogue[i])
    else:
        character_dialog[Character[i]] = []
        character_dialog[Character[i]].append(Dialogue[i])

# For each character, sample up to 5000 lines and vectorize each sufficiently long line word by word.
dataX, dataY = [], []
for key, value in character_dialog.items():
    random.shuffle(value)
    value = value[:5000]
    for i in range(len(value)):
        if len(value[i]) > 10:
            dataX.append(tokenizer_name.texts_to_matrix(list(prepare_word(value[i], n=100))))
            dataY.append(key)

# Shuffle features and labels together, then binarize the labels.
c = list(zip(dataX, dataY))
random.shuffle(c)
dataX, dataY = zip(*c)
le = LabelBinarizer()
dataY = le.fit_transform(dataY)
dataX = numpy.array(dataX)
dataY = numpy.array(dataY)
print(dataX.shape)

# 1-D CNN over the (words x vocabulary) matrix of each dialogue line.
model = Sequential()
model.add(Conv1D(512, 3, activation='relu', input_shape=(len(dataX[0]), len(dataX[0][0]))))
model.add(Conv1D(512, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(256, 3, activation='relu'))

# Create tokenizer
num_words_keep = 1000
tokenizer = Tokenizer(num_words=num_words_keep, filters='', lower=False, split=' ',
                      char_level=False, oov_token=None)

# Fit tokenizer on training data
x_train = train.iloc[:, 0]
tokenizer.fit_on_texts(texts=x_train)

modes = ['binary', 'count', 'tfidf', 'freq']

# Training data
y_train = train.iloc[:, 1]
x_train = tokenizer.texts_to_matrix(x_train, mode=modes[1])
y_train = utils.to_categorical(y_train, num_classes=2)

# Validation data
x_validate = validate.iloc[:, 0]
x_validate = tokenizer.texts_to_matrix(x_validate, mode=modes[1])
y_validate = validate.iloc[:, 1]
y_validate = utils.to_categorical(y_validate, num_classes=2)

# Test data
x_test = test.iloc[:, 0]
x_test = tokenizer.texts_to_matrix(x_test, mode=modes[1])
y_test = test.iloc[:, 1]

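For a quick comparison of the four modes listed above, here is a small self-contained sketch on a made-up two-document corpus; each row of the output is one document and column j corresponds to word index j in the tokenizer's word_index.

from keras.preprocessing.text import Tokenizer

docs = ['the cat sat', 'the cat sat on the mat']
tok = Tokenizer()
tok.fit_on_texts(docs)
for mode in ['binary', 'count', 'tfidf', 'freq']:
    # 'binary' marks presence, 'count' raw counts, 'tfidf' TF-IDF weights, 'freq' within-document frequencies.
    print(mode)
    print(tok.texts_to_matrix(docs, mode=mode))
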
twenty_train = fetch_20newsgroups(subset='all', shuffle=False, remove=('headers', 'footers'))
x, y = twenty_train.data, twenty_train.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, shuffle=False)

# Fit the vocabulary on the training split and vectorize both splits with TF-IDF weights.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.texts_to_matrix(x_test, mode='tfidf')

# convert list of labels to binary class matrix
nb_classes = np.max(y_train) + 1
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)
print(nb_classes)

# pre-processing: divide by max and subtract mean
input_dim = x_train.shape[1]

reduce_percent = 0.8
loc = 'results/normal'
save_dir = gen_save_dir(loc)
# part_percentage = round(reduce_percent * len(X_train))