def main():
    """Print the test samples that the saved Dickens/Verne model gets wrong.

    Loads the saved model plus the dev and test splits, vectorizes both
    splits, predicts on the test split and prints one line per misclassified
    sample. Only the test samples with index 0..351 are examined.
    """
    model = load_model("dickens_verne_sepcnn_model.h5")
    dev, _ = load_data.load_data("data/merged/dev.txt")
    test, tlabels = load_data.load_data("data/merged/test.txt")

    # sequence_vectorize returns its results in argument order (first texts,
    # second texts, word index), as the training functions in this file use
    # it. The previous unpacking had the first two names swapped, so the
    # model was predicting on the dev split while the loop below compared
    # the predictions against the test texts and labels.
    # NOTE(review): the vectorizer receives the dev split as its first
    # argument here; if the model was trained with a vocabulary built from a
    # different split, the word indices may not match -- confirm.
    _, vtest, _ = vectorize_data.sequence_vectorize(dev, test)
    preds = model.predict(vtest)

    # Label 0 maps to "Verne" and label 1 maps to "Dickens".
    authors = ["Verne", "Dickens"]
    for i, t in enumerate(test):
        p = preds[i][0]
        # Round the predicted score to the nearest class index.
        pred = int(round(p))
        p_auth = authors[pred]
        a_auth = authors[tlabels[i]]
        if p_auth != a_auth:
            print("%s %.4f ==> %s (actual: %s)" % (t, p, p_auth, a_auth))
        # Stop after the sample with index 351 has been examined.
        if i > 350:
            break
def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.3,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
    """Trains a sepCNN sequence model and saves it to disk.

    # Arguments
        data: tuples of training and validation texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Returns
        Tuple of (validation accuracy, validation loss) from the last epoch.

    # Raises
        ValueError: If validation data has label values which are outside
            range(num_classes) derived from the training labels.
    """
    training_set, validation_set = data
    train_texts, train_labels = training_set
    val_texts, val_labels = validation_set

    # Every validation label must be one of 0..num_classes-1.
    num_classes = explore_data.get_num_classes(train_labels)
    allowed_labels = range(num_classes)
    unexpected_labels = [
        label for label in val_labels if label not in allowed_labels
    ]
    if unexpected_labels:
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Turn the texts into sequence tensors.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Embedding input dimension: vocabulary size plus one, capped at TOP_K.
    num_features = min(len(word_index) + 1, TOP_K)

    model = build_model.sepcnn_model(
        blocks=blocks,
        filters=filters,
        kernel_size=kernel_size,
        embedding_dim=embedding_dim,
        dropout_rate=dropout_rate,
        pool_size=pool_size,
        input_shape=x_train.shape[1:],
        num_classes=num_classes,
        num_features=num_features)

    # Two classes use binary cross-entropy; anything else uses the sparse
    # categorical variant.
    loss = ('binary_crossentropy'
            if num_classes == 2 else 'sparse_categorical_crossentropy')
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
                  loss=loss,
                  metrics=['acc'])

    # Stop early when validation loss has not improved for two epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)

    fit_result = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=[early_stopping],
        validation_data=(x_val, val_labels),
        verbose=2,
        batch_size=batch_size)

    # Report the metrics recorded for the final epoch.
    metrics = fit_result.history
    final_val_acc = metrics['val_acc'][-1]
    final_val_loss = metrics['val_loss'][-1]
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=final_val_acc, loss=final_val_loss))

    model.save('rotten_tomatoes_sepcnn_model.h5')
    return final_val_acc, final_val_loss
def train_fine_tuned_sequence_model(data,
                                    embedding_data_dir,
                                    learning_rate=1e-3,
                                    epochs=1000,
                                    batch_size=128,
                                    blocks=2,
                                    filters=64,
                                    dropout_rate=0.2,
                                    embedding_dim=200,
                                    kernel_size=3,
                                    pool_size=3):
    """Trains sequence model with pre-trained embeddings, then fine-tunes it.

    Training happens in two phases: first with the embedding layer frozen,
    then with a second model instance (initialized from the first phase's
    saved weights) whose embedding layer is trainable.

    # Arguments
        data: tuples of training and test texts and labels.
        embedding_data_dir: string, path to the pre-training embeddings.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Returns
        Tuple of (validation accuracy, validation loss) from the last epoch
        of the fine-tuning phase.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if unexpected_labels:
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    embedding_matrix = _get_embedding_matrix(
        word_index, embedding_data_dir, embedding_dim)

    # Both phases build the same architecture; only the trainability of the
    # embedding layer differs.
    model_kwargs = dict(blocks=blocks,
                        filters=filters,
                        kernel_size=kernel_size,
                        embedding_dim=embedding_dim,
                        dropout_rate=dropout_rate,
                        pool_size=pool_size,
                        input_shape=x_train.shape[1:],
                        num_classes=num_classes,
                        num_features=num_features,
                        use_pretrained_embedding=True,
                        embedding_matrix=embedding_matrix)

    # Create model instance. First time we will train rest of network while
    # keeping embedding layer weights frozen. So, we set
    # is_embedding_trainable as False.
    model = build_model.sepcnn_model(is_embedding_trainable=False,
                                     **model_kwargs)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    model.fit(x_train,
              train_labels,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(x_val, val_labels),
              verbose=2,  # Logs once per epoch.
              batch_size=batch_size)

    # Save the model.
    model.save_weights('sequence_model_with_pre_trained_embedding.h5')

    # Create another model instance. This time we will unfreeze the embedding
    # layer and let it fine-tune to the given dataset.
    model = build_model.sepcnn_model(is_embedding_trainable=True,
                                     **model_kwargs)

    # Compile model with learning parameters. A fresh optimizer is used here
    # instead of the one from the first phase: that instance is tied to the
    # first model's variables, and reusing it on a model with a different
    # set of trainable variables is rejected by newer Keras optimizers.
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Load the weights that we had saved into this new model.
    model.load_weights('sequence_model_with_pre_trained_embedding.h5')

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('tweet_weather_sepcnn_fine_tuned_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
def batch_train_sequence_model(data,
                               learning_rate=1e-3,
                               epochs=1000,
                               batch_size=128,
                               blocks=2,
                               filters=64,
                               dropout_rate=0.2,
                               embedding_dim=200,
                               kernel_size=3,
                               pool_size=3):
    """Trains a sepCNN sequence model, feeding it batches from generators.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Returns
        Tuple of (validation accuracy, validation loss) from the last epoch.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    training_set, validation_set = data
    train_texts, train_labels = training_set
    val_texts, val_labels = validation_set

    # Validation labels must fall inside range(num_classes).
    num_classes = explore_data.get_num_classes(train_labels)
    allowed_labels = range(num_classes)
    unexpected_labels = [
        label for label in val_labels if label not in allowed_labels
    ]
    if unexpected_labels:
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Convert texts to sequence tensors.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Embedding input dimension; the extra 1 accounts for reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    model = build_model.sepcnn_model(
        blocks=blocks,
        filters=filters,
        kernel_size=kernel_size,
        embedding_dim=embedding_dim,
        dropout_rate=dropout_rate,
        pool_size=pool_size,
        input_shape=x_train.shape[1:],
        num_classes=num_classes,
        num_features=num_features)

    # Pick the loss by class count and compile.
    loss = ('binary_crossentropy'
            if num_classes == 2 else 'sparse_categorical_crossentropy')
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
                  loss=loss,
                  metrics=['acc'])

    # Stop when validation loss fails to improve for two epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)

    # Batch generators for the training and validation data.
    training_generator = _data_generator(
        x_train, train_labels, num_features, batch_size)
    validation_generator = _data_generator(
        x_val, val_labels, num_features, batch_size)

    # Steps needed to cover every sample once: full batches plus one more
    # step when a partial batch is left over.
    full_batches, leftover = divmod(x_train.shape[0], batch_size)
    steps_per_epoch = full_batches + 1 if leftover else full_batches

    full_batches, leftover = divmod(x_val.shape[0], batch_size)
    validation_steps = full_batches + 1 if leftover else full_batches

    fit_result = model.fit_generator(
        generator=training_generator,
        steps_per_epoch=steps_per_epoch,
        validation_data=validation_generator,
        validation_steps=validation_steps,
        callbacks=[early_stopping],
        epochs=epochs,
        verbose=2)  # One log line per epoch.

    # Report the metrics recorded for the final epoch.
    metrics = fit_result.history
    final_val_acc = metrics['val_acc'][-1]
    final_val_loss = metrics['val_loss'][-1]
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=final_val_acc, loss=final_val_loss))

    model.save('amazon_reviews_sepcnn_model.h5')
    return final_val_acc, final_val_loss