Example #1
def run_distilbert(strategy: tf.distribute.TPUStrategy, x_train: np.array,
                    x_valid: np.array, _y_train: np.array, y_valid: np.array,
                    train_dataset: tf.data.Dataset,
                    valid_dataset: tf.data.Dataset,
                    test_dataset: tf.data.Dataset, max_len: int, epochs: int,
                    batch_size: int) -> tf.keras.models.Model:
    """
    create and run distilbert on training and testing data
    """
    logger.info('build distilbert')

    with strategy.scope():
        transformer_layer = TFDistilBertModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=max_len)
    model.summary()

    # train given model
    n_steps = x_train.shape[0] // batch_size
    history = model.fit(train_dataset,
                        steps_per_epoch=n_steps,
                        validation_data=valid_dataset,
                        epochs=epochs)
    plot_train_val_loss(history, 'distilbert')

    n_steps = x_valid.shape[0] // batch_size
    _train_history_2 = model.fit(valid_dataset.repeat(),
                                 steps_per_epoch=n_steps,
                                 epochs=epochs * 2)

    scores = model.predict(test_dataset, verbose=1)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")

    return model
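build_model is a project helper that is not shown in this example; below is a minimal sketch of the DistilBERT classification head it most likely builds (the input name, head size, and learning rate are assumptions):

def build_model(transformer_layer, max_len: int) -> tf.keras.models.Model:
    # token ids, padded/truncated to max_len
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,
                                           name='input_word_ids')
    # the first output of TFDistilBertModel is the sequence of hidden states
    sequence_output = transformer_layer(input_word_ids)[0]
    # classify from the [CLS] token representation
    cls_token = sequence_output[:, 0, :]
    out = tf.keras.layers.Dense(1, activation='sigmoid')(cls_token)

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model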
Example #2
def simple_rnn(strategy: tf.distribute.TPUStrategy, x_train_padded: np.array,
               x_valid_padded: np.array, y_train: np.array, y_valid: np.array,
               max_len: int, embedding_size_x: int, embedding_size_y: int, epochs: int) -> tf.keras.models.Sequential:
    """
    create and run simple rnn on training and testing data
    """
    logger.info('build simple RNN')

    with strategy.scope():
        # A simpleRNN without any pretrained embeddings and one dense layer
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Embedding(embedding_size_x, embedding_size_y,
                                            input_length=max_len))
        model.add(tf.keras.layers.SimpleRNN(100))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam', metrics=['accuracy'])

    model.summary()

    # train the model, tracking validation loss for the plot below
    history = model.fit(x_train_padded, y_train,
                        validation_data=(x_valid_padded, y_valid),
                        epochs=epochs,
                        batch_size=64 * strategy.num_replicas_in_sync)
    plot_train_val_loss(history, 'simple_rnn')

    scores = model.predict(x_valid_padded)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")

    return model
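Every example here expects a ready tf.distribute.TPUStrategy; below is a minimal sketch of the usual TPU setup that produces one (the CPU/GPU fallback is an assumption):

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # no TPU found: fall back to the default (CPU/GPU) strategy
    strategy = tf.distribute.get_strategy()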
Example #3
def run_gru(strategy: tf.distribute.TPUStrategy, x_train_padded: np.array,
            x_valid_padded: np.array, y_train: np.array, y_valid: np.array,
            max_len: int, embedding_size_x: int, embedding_size_y: int,
            embedding_matrix: np.array, epochs: int) -> tf.keras.models.Sequential:
    """
    create and run gru on training and testing data
    """
    logger.info('build gru')

    # build GRU model
    with strategy.scope():
        # GRU with glove embeddings and dense layer
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Embedding(embedding_size_x,
                                            embedding_size_y,
                                            weights=[embedding_matrix],
                                            input_length=max_len,
                                            trainable=False))
        model.add(tf.keras.layers.SpatialDropout1D(0.3))
        model.add(tf.keras.layers.GRU(embedding_size_y))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam', metrics=['accuracy'])

    model.summary()

    history = model.fit(x_train_padded, y_train,
                        validation_data=(x_valid_padded, y_valid),
                        epochs=epochs,
                        batch_size=64 * strategy.num_replicas_in_sync)
    plot_train_val_loss(history, 'gru')

    scores = model.predict(x_valid_padded)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")

    return model
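roc_auc is another project helper used throughout these examples; a plausible minimal implementation (an assumption, wrapping scikit-learn) is:

from sklearn.metrics import roc_auc_score

def roc_auc(predictions, target):
    # predictions are sigmoid outputs, target the binary labels
    return roc_auc_score(target, predictions)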
Example #4
def run_rnn(strategy: tf.distribute.TPUStrategy, x_train_padded: np.array,
            x_valid_padded: np.array, y_train: np.array, y_valid: np.array,
            max_len: int, embedding_size_x: int, embedding_size_y: int,
            embedding_matrix: np.array,
            epochs: int) -> tf.keras.models.Sequential:
    """
    create and run bidirectional rnn on training and testing data
    """
    logger.info('build rnn')

    with strategy.scope():
        # A simple bidirectional LSTM with glove embeddings and one dense layer
        model = tf.keras.models.Sequential()
        model.add(
            tf.keras.layers.Embedding(embedding_size_x,
                                      embedding_size_y,
                                      weights=[embedding_matrix],
                                      input_length=max_len,
                                      trainable=False))
        model.add(
            tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(embedding_size_y,
                                     dropout=0.3,
                                     recurrent_dropout=0.3)))

        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    model.summary()

    # fit the bidirectional LSTM, tracking validation loss for the plot below
    history = model.fit(x_train_padded,
                        y_train,
                        validation_data=(x_valid_padded, y_valid),
                        epochs=epochs,
                        batch_size=64 * strategy.num_replicas_in_sync)
    plot_train_val_loss(history, 'bidirectional_lstm')

    scores = model.predict(x_valid_padded)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")

    return model
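Examples #2-#4 take pre-padded token id sequences, and #3-#4 additionally take a GloVe embedding_matrix; below is a minimal sketch of how those inputs are commonly prepared (the GloVe file path, and treating x_train/x_valid as raw text, are assumptions):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_valid))
x_train_padded = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)
x_valid_padded = pad_sequences(tokenizer.texts_to_sequences(x_valid), maxlen=max_len)

# load GloVe vectors into a {word: vector} lookup
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.rstrip().split(' ')
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# embedding_size_x is the vocabulary size, embedding_size_y the vector dimension
embedding_size_x = len(tokenizer.word_index) + 1
embedding_size_y = 300
embedding_matrix = np.zeros((embedding_size_x, embedding_size_y))
for word, index in tokenizer.word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[index] = vector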
Example #5
def train_test(
    clean_data: pd.DataFrame
) -> Tuple[tf.keras.models.Sequential, tf.keras.models.Sequential]:
    """
    train test
    creates the tensorflow models for sentiment analysis
    """
    logger.info('run training and testing for lstm and cnn')

    all_reviews: List[str] = clean_data[review_key]
    labels: List[int] = clean_data[class_key]

    train_reviews, test_reviews, train_labels, test_labels = train_test_split(
        all_reviews, labels, test_size=TEST_SIZE)
    train_reviews, validation_reviews, train_labels, validation_labels = train_test_split(
        train_reviews, train_labels, test_size=VALIDATION_SIZE)

    # create the training, validation, and testing datasets
    training_dataset = tf.data.Dataset.from_tensor_slices(
        (train_reviews, train_labels))
    validation_dataset = tf.data.Dataset.from_tensor_slices(
        (validation_reviews, validation_labels))
    testing_dataset = tf.data.Dataset.from_tensor_slices(
        (test_reviews, test_labels))

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    training_dataset = training_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    validation_dataset = validation_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    testing_dataset = testing_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(1):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    vocab_size = 10000
    sequence_length = 250

    vectorize_layer = TextVectorization(standardize=standardize_text,
                                        max_tokens=vocab_size,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    autotune = tf.data.experimental.AUTOTUNE
    training_dataset = training_dataset.cache().prefetch(buffer_size=autotune)

    train_text = training_dataset.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    embedding_dim = 16

    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    # create lstm model
    lstm_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.LSTM(256),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])

    learning_rate: float = 1e-3
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    # the output layer already applies a sigmoid, so the loss receives probabilities
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

    # save accuracy after each epoch
    metrics = [tf.keras.metrics.BinaryAccuracy()]

    lstm_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    _lstm_callbacks = [UseMaxWeights()]
    # tried setting callbacks in the fit function below to lstm_callbacks
    # to use the max of all hidden states as the context vector for prediction
    # it did not work as well as using the last hidden state, so I am
    # not using the callback
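    # a layer-based way to get the same "max over all hidden states" behaviour,
    # without a callback (a sketch only, not what UseMaxWeights actually does):
    # replace the LSTM(256) above with LSTM(256, return_sequences=True)
    # followed by tf.keras.layers.GlobalMaxPooling1D().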

    # run training and save history
    hist = lstm_model.fit(training_dataset,
                          epochs=12,
                          callbacks=[],
                          validation_data=validation_dataset)

    plot_train_val_loss(hist, 'reviews_lstm')

    logger.info('lstm model summary:')
    lstm_model.summary()

    loss_metric, accuracy = lstm_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    classes = range(2)

    # don't print these metrics for equally-sized datasets

    # precision, recall, f_score, support = get_precision_recall_fscore(
    #     lstm_model, testing_dataset, classes)

    # for i in classes:
    #     logger.info(f'{reviews_class_map[i]}:')
    #     logger.info(f'precision: {precision[i]}')
    #     logger.info(f'recall: {recall[i]}')
    #     logger.info(f'f-score: {f_score[i]}')
    #     logger.info(f'support: {support[i]}')

    # create cnn model
    cnn_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.ZeroPadding1D(1),
        tf.keras.layers.Conv1D(32, 2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])

    cnn_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # save cnn model history
    hist = cnn_model.fit(training_dataset,
                         epochs=16,
                         callbacks=[],
                         validation_data=validation_dataset)

    plot_train_val_loss(hist, 'reviews_cnn')

    logger.info('cnn model summary')
    cnn_model.summary()

    # print the loss and accuracy at the end on the testing dataset
    loss_metric, accuracy = cnn_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    # precision, recall, f_score, support = get_precision_recall_fscore(
    #     cnn_model, testing_dataset, classes)

    # for i in classes:
    #     logger.info(f'{reviews_class_map[i]}:')
    #     logger.info(f'precision: {precision[i]}')
    #     logger.info(f'recall: {recall[i]}')
    #     logger.info(f'f-score: {f_score[i]}')
    #     logger.info(f'support: {support[i]}')

    return lstm_model, cnn_model
Example #6
def train_test(
    clean_data: pd.DataFrame, label_list: List[BookType]
) -> Tuple[tf.keras.models.Sequential, tf.keras.models.Sequential]:
    """
    train test
    run training and testing for book classification
    """
    logger.info('run training and testing for lstm and cnn')

    all_paragraphs: List[str] = [
        ' '.join(paragraph) for paragraph in clean_data[paragraph_key]
    ]
    labels: List[int] = clean_data[class_key]

    train_paragraphs, test_paragraphs, train_labels, test_labels = train_test_split(
        all_paragraphs, labels, test_size=TEST_SIZE)
    train_paragraphs, validation_paragraphs, train_labels, validation_labels = train_test_split(
        train_paragraphs, train_labels, test_size=VALIDATION_SIZE)

    # create training, validation and testing datasets
    training_dataset = tf.data.Dataset.from_tensor_slices(
        (train_paragraphs, train_labels))
    validation_dataset = tf.data.Dataset.from_tensor_slices(
        (validation_paragraphs, validation_labels))
    testing_dataset = tf.data.Dataset.from_tensor_slices(
        (test_paragraphs, test_labels))

    # buffer size is used to shuffle the dataset
    buffer_size = 10000
    # shuffle and batch datasets
    training_dataset = training_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    validation_dataset = validation_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)
    testing_dataset = testing_dataset.shuffle(buffer_size).batch(
        batch_size, drop_remainder=True)

    # print some samples
    logger.success('training data sample:')
    for input_example, target_example in training_dataset.take(1):
        logger.info(f"\ninput: {input_example}\ntarget: {target_example}")

    vocab_size = 10000
    sequence_length = 250

    # decided not to use a pre-trained embedding layer (https://www.tensorflow.org/hub/tutorials/cord_19_embeddings_keras#training_a_citaton_intent_classifier)
    # because I already wrote this text vectorizer system and did not want to redo
    # the same work. I tried using the universal sentence encoder from tf-hub:
    # https://tfhub.dev/google/universal-sentence-encoder/4
    # the proof of concept worked well, but implementing it would require changing
    # all of the code below.
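    # a minimal sketch of that tf-hub alternative (not used here; assumes
    # tensorflow_hub is installed and imported as hub):
    # use_layer = hub.KerasLayer(
    #     'https://tfhub.dev/google/universal-sentence-encoder/4',
    #     input_shape=[], dtype=tf.string, trainable=False)
    # model = tf.keras.models.Sequential([
    #     use_layer,
    #     tf.keras.layers.Dense(64, activation='relu'),
    #     tf.keras.layers.Dense(num_classes),
    # ])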

    # create vectorization layer
    vectorize_layer = TextVectorization(standardize=standardize_text,
                                        max_tokens=vocab_size,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    autotune = tf.data.experimental.AUTOTUNE
    training_dataset = training_dataset.cache().prefetch(buffer_size=autotune)

    train_text = training_dataset.map(lambda x, y: x)
    # adapt vectorization layer to text input
    vectorize_layer.adapt(train_text)

    embedding_dim = 16

    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    num_classes: int = len(label_list)

    output_layer = tf.keras.layers.Dense(num_classes)

    # create lstm model
    lstm_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.LSTM(128),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])

    learning_rate: float = 1e-3
    # create optimizer and loss function
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # save accuracy metric on each epoch
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

    lstm_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # optional callback, described below
    _lstm_callbacks = [UseMaxWeights()]
    # tried setting callbacks in the fit function below to lstm_callbacks
    # to use the max of all hidden states as the context vector for prediction
    # it did not work as well as using the last hidden state, so I am
    # not using the callback

    # train model and save history
    hist = lstm_model.fit(training_dataset,
                          epochs=12,
                          callbacks=[],
                          validation_data=validation_dataset)

    # plot training loss
    plot_train_val_loss(hist, 'books_lstm')

    # print model summary
    logger.info('lstm model summary:')
    lstm_model.summary()

    loss_metric, accuracy = lstm_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    classes = range(num_classes)

    precision, recall, f_score, support = get_precision_recall_fscore(
        lstm_model, testing_dataset, classes)

    for i in classes:
        current_book = label_list[i]
        logger.info(f'{class_map[current_book]}:')
        logger.info(f'precision: {precision[i]}')
        logger.info(f'recall: {recall[i]}')
        logger.info(f'f-score: {f_score[i]}')
        logger.info(f'support: {support[i]}')

    cnn_model = tf.keras.models.Sequential([
        vectorize_layer,
        embedding_layer,
        tf.keras.layers.ZeroPadding1D(1),
        tf.keras.layers.Conv1D(32, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Conv1D(24, 2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        output_layer,
    ])

    cnn_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    hist = cnn_model.fit(training_dataset,
                         epochs=14,
                         callbacks=[],
                         validation_data=validation_dataset)

    plot_train_val_loss(hist, 'books_cnn')

    logger.info('cnn model summary')
    cnn_model.summary()

    loss_metric, accuracy = cnn_model.evaluate(testing_dataset)
    logger.info(f'loss: {loss_metric}, accuracy: {accuracy}')

    precision, recall, f_score, support = get_precision_recall_fscore(
        cnn_model, testing_dataset, classes)

    for i in classes:
        current_book = label_list[i]
        logger.info(f'{class_map[current_book]}:')
        logger.info(f'precision: {precision[i]}')
        logger.info(f'recall: {recall[i]}')
        logger.info(f'f-score: {f_score[i]}')
        logger.info(f'support: {support[i]}')

    return lstm_model, cnn_model
Example #7

loss = torch.nn.MSELoss(reduction='sum')
# alternative loss: mean absolute error (MAE)
# loss = torch.nn.L1Loss(reduction='sum')

best_val_loss = math.inf
best_epoch = 0

logger.info("Starting training...")
for epoch in range(1, args.n_epochs + 1):  # epochs numbered from 1
    train_loss = train(model, train_loader, optimizer, loss, device, scheduler,
                       logger if args.verbose else None)
    logger.info("Epoch {}: Training Loss {}".format(epoch, train_loss))

    val_loss = test(model, val_loader, loss, device, log_dir, epoch)
    logger.info("Epoch {}: Validation Loss {}".format(epoch, val_loss))
    if scheduler and not isinstance(scheduler, NoamLR):
        scheduler.step(val_loss)

    if val_loss <= best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch
        # torch.save(model.state_dict(), os.path.join(log_dir, 'best_model'))

logger.info("Best Validation Loss {} on Epoch {}".format(
    best_val_loss, best_epoch))

log_file = os.path.join(log_dir, log_file_name + '.log')
plot_train_val_loss(log_file)
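train and test are project helpers that are not part of this snippet; below is a minimal sketch of what the training half typically looks like, with the signature inferred from the call above and everything inside the body an assumption:

def train(model, train_loader, optimizer, loss_fn, device, scheduler, logger=None):
    model.train()
    total_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        # per-step schedulers such as NoamLR advance here; epoch-level
        # schedulers are stepped once per epoch in the outer loop instead
        if scheduler is not None and isinstance(scheduler, NoamLR):
            scheduler.step()

        total_loss += batch_loss.item()
        if logger is not None:
            logger.info(f"batch loss: {batch_loss.item():.4f}")
    return total_loss / len(train_loader)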