def run_distilbert(strategy: tf.distribute.TPUStrategy, x_train: np.array,
                   x_valid: np.array, _y_train: np.array, y_valid: np.array,
                   train_dataset: tf.data.Dataset,
                   valid_dataset: tf.data.Dataset,
                   _test_dataset: tf.data.Dataset, max_len: int, epochs: int,
                   batch_size: int) -> tf.keras.models.Model:
    """
    create and run distilbert on training and testing data
    """
    logger.info('build distilbert')
    with strategy.scope():
        transformer_layer = TFDistilBertModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=max_len)
    model.summary()

    # train given model
    n_steps = x_train.shape[0] // batch_size
    history = model.fit(train_dataset,
                        steps_per_epoch=n_steps,
                        validation_data=valid_dataset,
                        epochs=epochs)
    plot_train_val_loss(history, 'distilbert')

    # fine-tune on the validation set for additional epochs
    n_steps = x_valid.shape[0] // batch_size
    _train_history_2 = model.fit(valid_dataset.repeat(),
                                 steps_per_epoch=n_steps,
                                 epochs=epochs * 2)

    # score the validation inputs and report AUC against their labels
    scores = model.predict(x_valid, verbose=1)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")
    return model
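# build_model is defined elsewhere in the project; below is a minimal sketch
# of what a single-input DistilBERT classification head could look like,
# assuming the model takes padded token ids and predicts a binary label from
# the first ([CLS]) token. the name build_model_sketch, the learning rate,
# and the head layout are illustrative assumptions, not the exact definition.
def build_model_sketch(transformer: TFDistilBertModel,
                       max_len: int = 512) -> tf.keras.models.Model:
    """
    sketch: wrap a pretrained transformer with a binary classification head
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len, ),
                                           dtype=tf.int32,
                                           name='input_word_ids')
    # the first output of the transformer is the sequence of hidden states
    sequence_output = transformer(input_word_ids)[0]
    # use the hidden state of the first ([CLS]) token as the pooled summary
    cls_token = sequence_output[:, 0, :]
    out = tf.keras.layers.Dense(1, activation='sigmoid')(cls_token)
    model = tf.keras.models.Model(inputs=input_word_ids, outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model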
def simple_rnn(strategy: tf.distribute.TPUStrategy, x_train_padded: np.array,
               x_valid_padded: np.array, y_train: np.array,
               y_valid: np.array, max_len: int, embedding_size_x: int,
               embedding_size_y: int,
               epochs: int) -> tf.keras.models.Sequential:
    """
    create and run simple rnn on training and testing data
    """
    logger.info('build simple RNN')
    with strategy.scope():
        # a SimpleRNN without any pretrained embeddings and one dense layer
        model = tf.keras.models.Sequential()
        model.add(
            tf.keras.layers.Embedding(embedding_size_x,
                                      embedding_size_y,
                                      input_length=max_len))
        model.add(tf.keras.layers.SimpleRNN(100))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    model.summary()

    # run model train
    history = model.fit(x_train_padded,
                        y_train,
                        epochs=epochs,
                        batch_size=64 * strategy.num_replicas_in_sync)
    plot_train_val_loss(history, 'simple_rnn')

    scores = model.predict(x_valid_padded)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")
    return model
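# roc_auc is a small helper shared by the models in this file; a plausible
# implementation using sklearn is sketched below, assuming `predictions` are
# per-sample probabilities and `target` the binary labels. the sketch name
# and sklearn usage are assumptions, not the project's exact helper.
from sklearn import metrics


def roc_auc_sketch(predictions: np.array, target: np.array) -> float:
    """
    sketch: compute area under the ROC curve for binary predictions
    """
    fpr, tpr, _thresholds = metrics.roc_curve(target, predictions)
    return metrics.auc(fpr, tpr)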
def run_gru(strategy: tf.distribute.TPUStrategy, x_train_padded: np.array,
            x_valid_padded: np.array, y_train: np.array, y_valid: np.array,
            max_len: int, embedding_size_x: int, embedding_size_y: int,
            embedding_matrix: np.array,
            epochs: int) -> tf.keras.models.Sequential:
    """
    create and run gru on training and testing data
    """
    logger.info('build gru')
    # build GRU model
    with strategy.scope():
        # GRU with GloVe embeddings and a dense layer
        model = tf.keras.models.Sequential()
        model.add(
            tf.keras.layers.Embedding(embedding_size_x,
                                      embedding_size_y,
                                      weights=[embedding_matrix],
                                      input_length=max_len,
                                      trainable=False))
        model.add(tf.keras.layers.SpatialDropout1D(0.3))
        model.add(tf.keras.layers.GRU(embedding_size_y))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    model.summary()

    history = model.fit(x_train_padded,
                        y_train,
                        epochs=epochs,
                        batch_size=64 * strategy.num_replicas_in_sync)
    plot_train_val_loss(history, 'gru')

    scores = model.predict(x_valid_padded)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")
    return model
def run_rnn(strategy: tf.distribute.TPUStrategy, x_train_padded: np.array,
            x_valid_padded: np.array, y_train: np.array, y_valid: np.array,
            max_len: int, embedding_size_x: int, embedding_size_y: int,
            embedding_matrix: np.array,
            epochs: int) -> tf.keras.models.Sequential:
    """
    create and run bidirectional rnn on training and testing data
    """
    logger.info('build rnn')
    with strategy.scope():
        # a simple bidirectional LSTM with GloVe embeddings and one dense layer
        model = tf.keras.models.Sequential()
        model.add(
            tf.keras.layers.Embedding(embedding_size_x,
                                      embedding_size_y,
                                      weights=[embedding_matrix],
                                      input_length=max_len,
                                      trainable=False))
        model.add(
            tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(embedding_size_y,
                                     dropout=0.3,
                                     recurrent_dropout=0.3)))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    model.summary()

    # fit rnn model, plot the training output
    history = model.fit(x_train_padded,
                        y_train,
                        epochs=epochs,
                        batch_size=64 * strategy.num_replicas_in_sync)
    plot_train_val_loss(history, 'bidirectional_lstm')

    scores = model.predict(x_valid_padded)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")
    return model
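# run_gru and run_rnn above expect a GloVe embedding_matrix of shape
# (embedding_size_x, embedding_size_y). a hedged sketch of how such a matrix
# is typically built from a fitted tokenizer's word index and a GloVe text
# file follows; the function name, the default file path, and the Dict
# annotation (from typing) are illustrative assumptions.
from typing import Dict


def build_embedding_matrix_sketch(
        word_index: Dict[str, int],
        embedding_size_y: int,
        glove_path: str = 'glove.6B.300d.txt') -> np.array:
    """
    sketch: map each tokenizer index to its pretrained GloVe vector
    """
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    # rows default to zero vectors for words missing from GloVe
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_size_y))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix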
def train_test(
    clean_data: pd.DataFrame
) -> Tuple[tf.keras.models.Sequential, tf.keras.models.Sequential]:
    """
    train test creates the tensorflow models for sentiment analysis
    """
    logger.info('run training and testing for lstm and cnn')
    all_reviews: List[str] = clean_data[review_key]
    labels: List[int] = clean_data[class_key]
    train_reviews, test_reviews, train_labels, test_labels = train_test_split(
        all_reviews, labels, test_size=TEST_SIZE)
    train_reviews, validation_reviews, train_labels, validation_labels = train_test_split(
        train_reviews, train_labels, test_size=VALIDATION_SIZE)

    # create the training, validation, and testing datasets
    training_dataset = tf.data.Dataset.from_tensor_slices(
        (train_reviews, train_labels))
    validation_dataset = tf.data.Dataset.from_tensor_slices(
        (validation_reviews, validation_labels))
    testing_dataset = tf.data.Dataset.from_tensor_slices(
        (test_reviews, test_labels))

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    training_dataset = training_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    validation_dataset = validation_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    testing_dataset = testing_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(1):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    vocab_size = 10000
    sequence_length = 250
    vectorize_layer = TextVectorization(
        standardize=standardize_text,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length)

    autotune = tf.data.experimental.AUTOTUNE
    training_dataset = training_dataset.cache().prefetch(buffer_size=autotune)
    train_text = training_dataset.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    embedding_dim = 16
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    # create lstm model
    lstm_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.LSTM(256),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])

    learning_rate: float = 1e-3
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    # the output layer already applies a sigmoid, so the model emits
    # probabilities rather than logits
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    # save accuracy after each epoch
    metrics = [tf.keras.metrics.BinaryAccuracy()]
    lstm_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    _lstm_callbacks = [UseMaxWeights()]
    # tried setting callbacks in the fit function below to lstm_callbacks
    # to use the max of all hidden states as the context vector for
    # prediction. it did not work as well as using the last hidden state,
    # so I am not using the callback

    # run training and save history
    hist = lstm_model.fit(training_dataset,
                          epochs=12,
                          callbacks=[],
                          validation_data=validation_dataset)
    plot_train_val_loss(hist, 'reviews_lstm')

    logger.info('lstm model summary:')
    lstm_model.summary()

    loss_metric, accuracy = lstm_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    classes = range(2)
    # don't print these metrics for equally-sized datasets
    # precision, recall, f_score, support = get_precision_recall_fscore(
    #     lstm_model, testing_dataset, classes)
    # for i in classes:
    #     logger.info(f'{reviews_class_map[i]}:')
    #     logger.info(f'precision: {precision[i]}')
    #     logger.info(f'recall: {recall[i]}')
    #     logger.info(f'f-score: {f_score[i]}')
    #     logger.info(f'support: {support[i]}')

    # create cnn model
    cnn_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.ZeroPadding1D(1),
        tf.keras.layers.Conv1D(32, 2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])
    cnn_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # save cnn model history
    hist = cnn_model.fit(training_dataset,
                         epochs=16,
                         callbacks=[],
                         validation_data=validation_dataset)
    plot_train_val_loss(hist, 'reviews_cnn')

    logger.info('cnn model summary')
    cnn_model.summary()

    # print the loss and accuracy at the end on the testing dataset
    loss_metric, accuracy = cnn_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')
    # precision, recall, f_score, support = get_precision_recall_fscore(
    #     cnn_model, testing_dataset, classes)
    # for i in classes:
    #     logger.info(f'{reviews_class_map[i]}:')
    #     logger.info(f'precision: {precision[i]}')
    #     logger.info(f'recall: {recall[i]}')
    #     logger.info(f'f-score: {f_score[i]}')
    #     logger.info(f'support: {support[i]}')

    return lstm_model, cnn_model
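# standardize_text is the custom standardizer passed to TextVectorization in
# the train_test functions; a minimal sketch using tf.strings ops is shown
# below, assuming the goal is lowercasing and stripping punctuation inside
# the tf graph. the sketch name and exact cleaning rules are assumptions.
import re
import string


def standardize_text_sketch(input_data: tf.Tensor) -> tf.Tensor:
    """
    sketch: lowercase and remove punctuation as graph-compatible string ops
    """
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')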
def train_test(
    clean_data: pd.DataFrame, label_list: List[BookType]
) -> Tuple[tf.keras.models.Sequential, tf.keras.models.Sequential]:
    """
    train test run training and testing for book classification
    """
    logger.info('run training and testing for lstm and cnn')
    all_paragraphs: List[str] = [
        ' '.join(paragraph) for paragraph in clean_data[paragraph_key]
    ]
    labels: List[int] = clean_data[class_key]
    train_paragraphs, test_paragraphs, train_labels, test_labels = train_test_split(
        all_paragraphs, labels, test_size=TEST_SIZE)
    train_paragraphs, validation_paragraphs, train_labels, validation_labels = train_test_split(
        train_paragraphs, train_labels, test_size=VALIDATION_SIZE)

    # create training, validation and testing datasets
    training_dataset = tf.data.Dataset.from_tensor_slices(
        (train_paragraphs, train_labels))
    validation_dataset = tf.data.Dataset.from_tensor_slices(
        (validation_paragraphs, validation_labels))
    testing_dataset = tf.data.Dataset.from_tensor_slices(
        (test_paragraphs, test_labels))

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    # shuffle and batch datasets
    training_dataset = training_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    validation_dataset = validation_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    testing_dataset = testing_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(1):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    vocab_size = 10000
    sequence_length = 250

    # decided not to use a pre-trained embedding layer
    # (https://www.tensorflow.org/hub/tutorials/cord_19_embeddings_keras#training_a_citaton_intent_classifier)
    # because I already wrote this text vectorizer system, and did not want
    # to redo the same work. I tried using the universal sentence encoder
    # from tf-hub: https://tfhub.dev/google/universal-sentence-encoder/4
    # the proof of concept worked well, but implementing it would require
    # changing all of the code below.

    # create vectorization layer
    vectorize_layer = TextVectorization(
        standardize=standardize_text,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length)

    autotune = tf.data.experimental.AUTOTUNE
    training_dataset = training_dataset.cache().prefetch(buffer_size=autotune)
    train_text = training_dataset.map(lambda x, y: x)
    # adapt vectorization layer to text input
    vectorize_layer.adapt(train_text)

    embedding_dim = 16
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    num_classes: int = len(label_list)
    # output logits, one per class
    output_layer = tf.keras.layers.Dense(num_classes)

    # create lstm model
    lstm_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.LSTM(128),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])

    learning_rate: float = 1e-3
    # create optimizer and loss function
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # save accuracy metric on each epoch
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    lstm_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # optional callback, described below
    _lstm_callbacks = [UseMaxWeights()]
    # tried setting callbacks in the fit function below to lstm_callbacks
    # to use the max of all hidden states as the context vector for
    # prediction. it did not work as well as using the last hidden state,
    # so I am not using the callback

    # train model and save history
    hist = lstm_model.fit(training_dataset,
                          epochs=12,
                          callbacks=[],
                          validation_data=validation_dataset)
    # plot training loss
    plot_train_val_loss(hist, 'books_lstm')

    # print model summary
    logger.info('lstm model summary:')
    lstm_model.summary()

    loss_metric, accuracy = lstm_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    classes = range(num_classes)
    precision, recall, f_score, support = get_precision_recall_fscore(
        lstm_model, testing_dataset, classes)
    for i in classes:
        current_book = label_list[i]
        logger.info(f'{class_map[current_book]}:')
        logger.info(f'precision: {precision[i]}')
        logger.info(f'recall: {recall[i]}')
        logger.info(f'f-score: {f_score[i]}')
        logger.info(f'support: {support[i]}')

    # create cnn model
    cnn_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.ZeroPadding1D(1),
        tf.keras.layers.Conv1D(32, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Conv1D(24, 2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])
    cnn_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    hist = cnn_model.fit(training_dataset,
                         epochs=14,
                         callbacks=[],
                         validation_data=validation_dataset)
    plot_train_val_loss(hist, 'books_cnn')

    logger.info('cnn model summary')
    cnn_model.summary()

    loss_metric, accuracy = cnn_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    precision, recall, f_score, support = get_precision_recall_fscore(
        cnn_model, testing_dataset, classes)
    for i in classes:
        current_book = label_list[i]
        logger.info(f'{class_map[current_book]}:')
        logger.info(f'precision: {precision[i]}')
        logger.info(f'recall: {recall[i]}')
        logger.info(f'f-score: {f_score[i]}')
        logger.info(f'support: {support[i]}')

    return lstm_model, cnn_model
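# get_precision_recall_fscore is used above to report per-class metrics; a
# hedged sketch built on sklearn's precision_recall_fscore_support follows,
# assuming the dataset yields (text_batch, label_batch) pairs and the model
# outputs per-class logits. the sketch name and loop are assumptions.
from sklearn.metrics import precision_recall_fscore_support


def get_precision_recall_fscore_sketch(model, dataset, classes):
    """
    sketch: collect predictions over a dataset and score each class
    """
    y_true: List[int] = []
    y_pred: List[int] = []
    for text_batch, label_batch in dataset:
        logits = model.predict(text_batch)
        # take the highest-scoring class for each sample
        y_pred.extend(np.argmax(logits, axis=1).tolist())
        y_true.extend(label_batch.numpy().tolist())
    # returns four arrays: precision, recall, f-score, support per class
    return precision_recall_fscore_support(y_true,
                                           y_pred,
                                           labels=list(classes))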
logger.info('')
loss = torch.nn.MSELoss(reduction='sum')
# alternative loss: MAE torch.nn.L1Loss(reduction='sum')
best_val_loss = math.inf
best_epoch = 0

logger.info("Starting training...")
# epochs are 1-indexed, so run through n_epochs inclusive
for epoch in range(1, args.n_epochs + 1):
    train_loss = train(model, train_loader, optimizer, loss, device,
                       scheduler, logger if args.verbose else None)
    logger.info("Epoch {}: Training Loss {}".format(epoch, train_loss))
    val_loss = test(model, val_loader, loss, device, log_dir, epoch)
    logger.info("Epoch {}: Validation Loss {}".format(epoch, val_loss))
    # NoamLR steps on its own schedule during training; only step other
    # schedulers on the validation loss here
    if scheduler and not isinstance(scheduler, NoamLR):
        scheduler.step(val_loss)
    if val_loss <= best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch
        # torch.save(model.state_dict(), os.path.join(log_dir, 'best_model'))

logger.info("Best Validation Loss {} on Epoch {}".format(
    best_val_loss, best_epoch))
log_file = os.path.join(log_dir, log_file_name + '.log')
plot_train_val_loss(log_file)
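# the train helper called in the loop above is defined elsewhere; a minimal
# sketch of a standard PyTorch epoch under the same signature is shown below,
# assuming the loader yields (inputs, targets) tensor pairs and that NoamLR
# is a per-batch scheduler. the name train_sketch, the batch structure, and
# the in-loop scheduler step are illustrative assumptions.
def train_sketch(model, loader, optimizer, loss_fn, device, scheduler=None,
                 logger=None):
    """
    sketch: run one training epoch and return the summed loss
    """
    model.train()
    total_loss = 0.0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        batch_loss = loss_fn(predictions, targets)
        batch_loss.backward()
        optimizer.step()
        # per-batch schedulers like NoamLR step here rather than per epoch
        if scheduler is not None and isinstance(scheduler, NoamLR):
            scheduler.step()
        total_loss += batch_loss.item()
        if logger is not None:
            logger.debug(f'batch loss: {batch_loss.item()}')
    return total_loss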