import pytest

import load_data
import train_ngram_model


def test_train_ngram_model():
    data_dir = './data/'
    data = load_data.load_imdb_sentiment_analysis_dataset(data_dir)
    # data = load_data.load_rotten_tomatoes_sentiment_analysis_dataset(data_dir)
    acc, loss = train_ngram_model.train_ngram_model(data)
    assert acc == pytest.approx(0.91, 0.02)
    assert loss == pytest.approx(0.24, 0.02)
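# Usage note (a sketch): run `pytest -q` from the repo root after downloading
# the IMDb dataset into ./data. The second argument to pytest.approx is a
# relative tolerance, so accuracies within roughly 2% of 0.91 pass.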
                                            units=units))
            params['accuracy'].append(accuracy)
    _plot_parameters(params)


def _plot_parameters(params):
    """Creates a 3D surface plot of the given parameters.

    # Arguments
        params: dict, contains layers, units and accuracy value combinations.
    """
    fig = plt.figure()
    # `fig.gca(projection='3d')` is deprecated in recent matplotlib releases;
    # use `add_subplot` instead.
    ax = fig.add_subplot(projection='3d')
    ax.plot_trisurf(params['layers'],
                    params['units'],
                    params['accuracy'],
                    cmap=cm.coolwarm,
                    antialiased=False)
    plt.show()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training an n-gram model.
    data = load_data.load_imdb_sentiment_analysis_dataset(FLAGS.data_dir)
    tune_ngram_model(data)
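# For context, a minimal sketch of the tuning loop that fills `params` before
# _plot_parameters() is called (assumed grid; the exact hyperparameter ranges
# live in the untruncated tune_ngram_model() above):
#
#   params = {'layers': [], 'units': [], 'accuracy': []}
#   for layers in [1, 2, 3]:
#       for units in [8, 16, 32, 64, 128]:
#           params['layers'].append(layers)
#           params['units'].append(units)
#           accuracy, _ = train_ngram_model.train_ngram_model(
#               data=data, layers=layers, units=units)
#           params['accuracy'].append(accuracy)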
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('rotten_tomatoes_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the Rotten Tomatoes movie reviews dataset to demonstrate
    # training a sequence model.
    data = load_data.load_rotten_tomatoes_sentiment_analysis_dataset(
        FLAGS.data_dir)
    train_sequence_model(data)
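# For context, a minimal sketch of the `callbacks` passed to fit() above
# (an assumption; the actual definition sits earlier in the untruncated
# train_sequence_model()):
#
#   callbacks = [tf.keras.callbacks.EarlyStopping(
#       monitor='val_loss', patience=2)]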
# -------------------
"""
Create a /data folder in this repo.
Download the v1 dataset to the /data folder from
https://ai.stanford.edu/~amaas/data/sentiment/
and extract its contents into /data/aclImdb.
"""

# Step 2: Explore Data
# --------------------
# Load the dataset.
data_dir = './data/'
# NOTE: Only pulling in 5000 samples to allow running locally.
data_tuple = load_data.load_imdb_sentiment_analysis_dataset(
    data_dir, seed=150, max_samples=5000)
# (train_texts, train_labels), (val_texts, val_labels) = data_tuple

# The two charts in the course:
# explore_data.plot_frequency_distribution_of_ngrams(train_texts)
# explore_data.plot_sample_length_distribution(train_texts)

# Additional data exploration functions:
# explore_data.get_num_words_per_sample(train_texts)
# explore_data.plot_class_distribution(train_labels)

# Step 3: Prepare Data
# --------------------
# -> Tokenization and vectorization are included in train_ngram_model().
# N-gram tokenization into unigrams and bigrams; a hedged sketch follows below.
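# A minimal sketch of what the n-gram vectorization inside train_ngram_model()
# amounts to (assumed parameters; the course vectorizes tf-idf-weighted
# unigrams and bigrams):
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', min_df=2)
#   x_train = vectorizer.fit_transform(train_texts)
#   x_val = vectorizer.transform(val_texts)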
def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_' + metric])


def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)  # Convert to lowercase.
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')  # Strip HTML line breaks.
    return tf.strings.regex_replace(
        stripped_html, '[%s]' % re.escape(string.punctuation), '')  # Remove punctuation.


# In[]
dataset_dir = os.path.join("../dataset", 'aclImdb')

"""1. load data"""
(train_texts, train_labels), (val_texts, val_labels) = \
    load_data.load_imdb_sentiment_analysis_dataset("../dataset")
print("load data finished!")

# In[]
train_texts_dataset = tf.data.Dataset.from_tensor_slices(train_texts)
test_texts_dataset = tf.data.Dataset.from_tensor_slices(val_texts)

# In[]
VOCAB_SIZE = 2000
sequence_length = 100
# For TF versions before 2.6, use
# tf.keras.layers.experimental.preprocessing.TextVectorization instead.
vectorizer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=sequence_length)
# The tensors of indices are 0-padded to the longest sequence in the batch,
# unless you set a fixed output_sequence_length (as done here).
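# In[]
# A minimal usage sketch (assumed next step): fit the vocabulary on the raw
# training text, then map a sample batch to 0-padded integer sequences.
vectorizer.adapt(train_texts_dataset.batch(64))
print(vectorizer(tf.constant(["the movie was great"])))  # Shape: (1, 100).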
import os
import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model

import load_data

# In[]
dataset_dir = os.path.join("../dataset", 'aclImdb')

"""1. load data"""
(train_texts, train_labels), (val_texts, val_labels) = \
    load_data.load_imdb_sentiment_analysis_dataset("../dataset")
print("load data finished!")

# In[]
test_texts_dataset = tf.data.Dataset.from_tensor_slices(val_texts)

# Rebuild the saved TextVectorization layer from its pickled config and weights.
from_disk = pickle.load(open("./models/tv_layer.pkl", "rb"))
new_v = tf.keras.layers.TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data before `set_weights`
# (known Keras quirk).
new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk['weights'])

# In[]
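# For reference, a sketch of how ./models/tv_layer.pkl is assumed to have been
# written: pickle the fitted layer's config together with its learned weights.
#
#   pickle.dump({'config': vectorizer.get_config(),
#                'weights': vectorizer.get_weights()},
#               open("./models/tv_layer.pkl", "wb"))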