def classify_text(self, text):
    """
    Performs sentiment analysis (text_classification) on a single piece of text.

    :param text: text sent in/from written query to be analyzed
    :return: the predicted class label for the text
    """
    sentimentInfo = self.models.get("text_classification")
    vocab = sentimentInfo["vocabulary"]
    # Clean up text
    text = lemmatize_text(text_clean_up([text]))
    # Encode text with the vocabulary built during training
    text = encode_text(vocab, text)
    text = sequence.pad_sequences(text, maxlen=sentimentInfo["max_text_length"])
    model = sentimentInfo["model"]
    prediction = tf.keras.backend.argmax(model.predict(text))
    return sentimentInfo["classes"][tf.keras.backend.get_value(prediction)[0]]
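# Example usage of classify_text() (a minimal sketch, not part of the code
# shown here). It assumes the enclosing class is the library's client object
# and that text_classification_query() below has already been run, so that
# self.models["text_classification"] is populated; the dataset path and query
# strings are hypothetical.
#
#   new_client = client("path/to/reviews.csv")
#   new_client.text_classification_query("classify the sentiment of each review")
#   label = new_client.classify_text("the product arrived quickly and works great")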
def text_classification_query(self, instruction, drop=None,
                              preprocess=True,
                              label_column=None,
                              test_size=0.2,
                              random_state=49,
                              learning_rate=1e-2,
                              epochs=20,
                              monitor="val_loss",
                              batch_size=32,
                              max_text_length=200,
                              max_features=20000,
                              generate_plots=True,
                              save_model=False,
                              save_path=os.getcwd()):
    """
    Applies the text_classification algorithm for sentiment analysis.

    :param instruction: natural-language instruction identifying the target column
    :param many params: used to configure the model's hyperparameters
    :return: a dictionary object with all of the information for the algorithm
    """

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")
    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater "
            "than or equal to 1 results in no training data)")
    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")
    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")
    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")
    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exist")

    if test_size == 0:
        testing = False
    else:
        testing = True

    data = DataReader(self.dataset)
    data = data.data_generator()

    if preprocess:
        data.fillna(0, inplace=True)
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if label_column is None:
        label = "label"
    else:
        label = label_column

    X, Y, target = get_target_values(data, instruction, label)
    Y = np.array(Y)
    classes = np.unique(Y)
    logger("->", "Target Column Found: {}".format(target))

    vocab = {}
    if preprocess:
        logger("Preprocessing data")
        X = lemmatize_text(text_clean_up(X.array))
        # the cleaned corpus doubles as the vocabulary reused later by classify_text
        vocab = X
        X = encode_text(X, X)

    X = np.array(X)

    logger("Building Keras LSTM model dynamically")
    model = get_keras_text_class(max_features, len(classes), learning_rate)

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

    # map the raw class labels to consecutive integer ids
    y_vals = np.unique(np.append(y_train, y_test))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i
    map_func = np.vectorize(lambda x: label_mappings[x])
    y_train = map_func(y_train)
    y_test = map_func(y_test)

    logger("Training initial model")

    # early stopping callback
    es = EarlyStopping(monitor=monitor, mode='auto', verbose=0, patience=5)

    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[es],
                        verbose=0)

    logger("->", "Final training loss: {}".format(history.history["loss"][-1]))
    if testing:
        logger("->", "Final validation loss: {}".format(
            history.history["val_loss"][-1]))
        logger("->", "Final validation accuracy: {}".format(
            history.history["val_accuracy"][-1]))
        losses = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']
        }
        accuracy = {
            'training_accuracy': history.history['accuracy'],
            'validation_accuracy': history.history['val_accuracy']
        }
    else:
        logger("->", "Final validation loss: {}".format("0, No validation done"))
        losses = {'training_loss': history.history['loss']}
        accuracy = {'training_accuracy': history.history['accuracy']}

    plots = {}
    if generate_plots:
        # generates appropriate classification plots by feeding all information
        logger("Generating plots")
        plots = generate_classification_plots(history, X, Y, model, X_test, y_test)

    if save_model:
        save(model, save_model, save_path=save_path)

    logger("Storing information in client object under key 'text_classification'")

    # store the trained artifacts in the client's models dictionary
    self.models["text_classification"] = {
        "model": model,
        "classes": classes,
        "plots": plots,
        "target": Y,
        "vocabulary": vocab,
        "interpreter": label_mappings,
        "max_text_length": max_text_length,
        'test_data': {'X': X_test, 'y': y_test},
        'losses': losses,
        'accuracy': accuracy
    }
    clearLog()
    return self.models["text_classification"]
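# Example usage of text_classification_query() (a minimal sketch; the CSV path,
# instruction string, and label column name are hypothetical and assume the
# enclosing class is the library's client object):
#
#   new_client = client("path/to/reviews.csv")
#   results = new_client.text_classification_query(
#       "classify the sentiment of each review",
#       label_column="sentiment",
#       epochs=10)
#   print(results["losses"]["training_loss"][-1])
#   print(new_client.classify_text("terrible customer service"))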
def summarization_query(self, instruction, preprocess=True,
                        label_column=None,
                        drop=None,
                        epochs=5,
                        batch_size=32,
                        learning_rate=3e-5,
                        max_text_length=512,
                        gpu=False,
                        test_size=0.2,
                        random_state=49,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    Applies the text summarization algorithm (fine-tunes a T5 model).

    :param instruction: natural-language instruction identifying the target column
    :param many params: used to configure the model's hyperparameters
    :return: a dictionary object with all of the information for the algorithm
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")
    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater "
            "than or equal to 1 results in no training data)")
    if max_text_length < 2:
        raise Exception("Text and summary must be at least of length 2")
    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")
    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")
    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exist")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install the GPU version of TensorFlow")
        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    if preprocess:
        data.fillna(0, inplace=True)
        logger("Preprocessing data...")

    if label_column is None:
        label = "summary"
    else:
        label = label_column

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    # Find target columns
    X, Y, target = get_target_values(data, instruction, label)
    logger("->", "Target Column Found: {}".format(target))
    logger("Establishing dataset walkers")

    # Clean up text
    if preprocess:
        logger("Preprocessing data")
        X = add_prefix(lemmatize_text(text_clean_up(X.array)), "summarize: ")
        Y = add_prefix(lemmatize_text(text_clean_up(Y.array)), "summarize: ")

    # tokenize text/summaries into input ids
    X = tokenize_for_input_ids(X, tokenizer, max_text_length)
    Y = tokenize_for_input_ids(Y, tokenizer, max_text_length)

    logger('Fine-Tuning the model on your dataset...')

    # Suppress unnecessary output while loading the pretrained weights
    with NoStdStreams():
        model = TFT5ForConditionalGeneration.from_pretrained(
            "t5-small", output_loading_info=False)

    if testing:
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state)
        test_dataset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).shuffle(10000).batch(batch_size)
    else:
        X_train = X
        y_train = Y

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X_train, y_train)).shuffle(10000).batch(batch_size)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    total_training_loss = []
    total_validation_loss = []

    # Training Loop
    with tf.device(device):
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0
            for data, truth in train_dataset:
                with tf.GradientTape() as tape:
                    # forward pass: the article tokens serve as both encoder and
                    # decoder input, with the summary tokens as the loss target
                    out = model(inputs=data, decoder_input_ids=data)
                    loss_value = loss(truth, out[0])
                    total_loss += loss_value
                grads = tape.gradient(loss_value, model.trainable_weights)
                optimizer.apply_gradients(zip(grads, model.trainable_weights))
            total_training_loss.append(total_loss)

            # Validation Loop
            if testing:
                for data, truth in test_dataset:
                    logits = model(inputs=data, decoder_input_ids=data, training=False)
                    val_loss = loss(truth, logits[0])
                    total_loss_val += val_loss
                total_validation_loss.append(total_loss_val)

    logger("->", "Final training loss: {}".format(
        str(total_training_loss[-1].numpy())))
    if testing:
        total_loss_val_str = str(total_validation_loss[-1].numpy())
    else:
        total_loss_val = [0]
        total_loss_val_str = "0, No validation done"
    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if testing:
        losses = {
            "Training loss": total_training_loss[-1].numpy(),
            "Validation loss": total_validation_loss[-1].numpy()
        }
    else:
        losses = {"Training loss": total_training_loss[-1].numpy()}

    plots = None
    if generate_plots:
        logger("Generating plots")
        plots = {
            "loss": libra.plotting.nonkeras_generate_plots.plot_loss(
                total_training_loss, total_validation_loss)
        }

    if save_model:
        logger("Saving model")
        model.save_weights(os.path.join(save_path, "summarization_checkpoint.ckpt"))

    logger("Storing information in client object under key 'summarization'")

    self.models["summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "plots": plots,
        "tokenizer": tokenizer,
        'losses': losses
    }
    clearLog()
    return self.models["summarization"]
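# Example usage of summarization_query() (a minimal sketch; the CSV path,
# instruction, and column name are hypothetical and assume the enclosing class
# is the library's client object). The fine-tuned TFT5ForConditionalGeneration
# model and its tokenizer are returned and also stored under
# self.models["summarization"] for later generation.
#
#   new_client = client("path/to/articles.csv")
#   results = new_client.summarization_query(
#       "summarize each article",
#       label_column="summary",
#       epochs=3,
#       test_size=0.1)
#   summarizer = results["model"]
#   tokenizer = results["tokenizer"]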