# Module-level imports required by the methods below
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, Flatten, Dense, Dropout,
                                     Conv1D, GlobalMaxPooling1D)
from tensorflow.keras.callbacks import EarlyStopping


def prepareData(self, processed_features, labels):
    # Split into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        processed_features, labels,
        test_size=self.TEST_SIZE, random_state=1)

    # Fit the tokenizer on the training texts only, then map both
    # splits to sequences of integer word indices
    tokenizer = Tokenizer(num_words=self.NUM_WORDS)
    tokenizer.fit_on_texts(x_train)
    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)

    # Pad every sequence to the longest training sequence
    self.INPUT_LENGTH = find_max_length(x_train)
    x_train = pad_sequences(x_train, maxlen=self.INPUT_LENGTH)
    x_test = pad_sequences(x_test, maxlen=self.INPUT_LENGTH)

    # Adding 1 because index 0 is reserved for padding
    self.VOCAB_SIZE = len(tokenizer.word_index) + 1
    return x_train, x_test, y_train, y_test
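# `find_max_length` is a project helper defined elsewhere; this sketch of
# its assumed behaviour (return the length of the longest tokenized
# sequence, so padding truncates nothing) is an illustration, not the
# original implementation:
def find_max_length(sequences):
    """Return the length of the longest sequence in `sequences`."""
    return max(len(seq) for seq in sequences)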
def ft_model(self, processed_features, labels, embedding_matrix,
             vocab_size, model):
    # NOTE: the incoming `model` argument is never used; a fresh
    # network is built below.
    classes_num = self.dataset.getParameters()["classes_num"]
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features, labels,
        test_size=self.TEST_SIZE, random_state=0)

    tokenizer = Tokenizer(num_words=self.NUM_WORDS)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    length = find_max_length(X_train)
    # vocab_size is passed in; it must match the tokenizer that was used
    # to build embedding_matrix, or the embedding rows will not line up
    # with the word indices produced here.
    X_train = pad_sequences(X_train, padding='post', maxlen=length)
    X_test = pad_sequences(X_test, padding='post', maxlen=length)

    # CNN on top of pretrained embeddings that are fine-tuned during
    # training (trainable=True)
    model = Sequential()
    model.add(Embedding(vocab_size, 64, input_length=length,
                        weights=[embedding_matrix], trainable=True))
    model.add(Dropout(0.5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes_num, activation=self.ACTIVATION))
    model.compile(optimizer='adam', loss=self.LOSSFUNC,
                  metrics=['accuracy'])

    es_callback = EarlyStopping(monitor='val_loss', patience=4)
    model.summary()
    history = model.fit(X_train, y_train,
                        epochs=self.EPOCHS,
                        verbose=1,
                        validation_data=(X_test, y_test),
                        batch_size=self.BATCH_SIZE,
                        callbacks=[es_callback])

    loss, accuracy = model.evaluate(X_train, y_train, verbose=1)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
    print("Testing Accuracy: {:.4f}".format(accuracy))
    return history
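# `ft_model` expects the rows of `embedding_matrix` to line up with the
# tokenizer's word indices. A minimal sketch of how such a matrix might
# be built; `embeddings_index` (word -> 64-dim pretrained vector, e.g.
# read from a fastText vectors file) and `build_embedding_matrix` are
# illustrative assumptions, not part of the original code:
import numpy as np

def build_embedding_matrix(tokenizer, embeddings_index, dim=64):
    # Row 0 is left as zeros for the reserved padding index
    matrix = np.zeros((len(tokenizer.word_index) + 1, dim))
    for word, idx in tokenizer.word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[idx] = vector
    return matrix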
def ol_model(self, processed_features, labels):
    classes_num = self.dataset.getParameters()["classes_num"]
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features, labels,
        test_size=self.TEST_SIZE, random_state=0)

    tokenizer = Tokenizer(num_words=self.NUM_WORDS)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    length = find_max_length(X_train)
    vocab_size = len(tokenizer.word_index) + 1
    X_train = pad_sequences(X_train, padding='post', maxlen=length)
    X_test = pad_sequences(X_test, padding='post', maxlen=length)

    # Baseline: a single dense layer on top of a flattened, randomly
    # initialised embedding
    model = Sequential()
    model.add(Embedding(vocab_size, 64, input_length=length))
    model.add(Flatten())
    model.add(Dense(classes_num, activation=self.ACTIVATION))
    model.compile(loss=self.LOSSFUNC, optimizer='adam',
                  metrics=['accuracy'])

    es_callback = EarlyStopping(monitor='val_loss', patience=3)
    model.summary()
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=self.EPOCHS,
                        batch_size=self.BATCH_SIZE,
                        verbose=1,
                        callbacks=[es_callback])

    scores = model.evaluate(X_test, y_test, verbose=1)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    return history
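# Both model builders return the Keras History object, so training curves
# can be inspected after a run. A minimal sketch, assuming TF 2.x metric
# names ('accuracy'/'val_accuracy'; older Keras logs 'acc'/'val_acc');
# `plot_history` is an illustrative helper, not part of the original code:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot train vs. validation accuracy and loss over epochs."""
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(10, 4))
    ax_acc.plot(history.history['accuracy'], label='train')
    ax_acc.plot(history.history['val_accuracy'], label='validation')
    ax_acc.set_title('Accuracy')
    ax_acc.legend()
    ax_loss.plot(history.history['loss'], label='train')
    ax_loss.plot(history.history['val_loss'], label='validation')
    ax_loss.set_title('Loss')
    ax_loss.legend()
    plt.show()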