def main(argv):
    """Command-line entry point: score generated sentences for every label.

    Loads the index-to-label mapping saved alongside the trained model, then
    for each label evaluates the corresponding actual/generated sentence
    files on style transfer, content preservation, word overlap and
    language-model log-likelihood, logging per-label scores and their means.
    """
    options = Options()
    parser = argparse.ArgumentParser()
    for required_flag in ("--classifier-model-path", "--training-path",
                          "--inference-path", "--embeddings-path",
                          "--language-model-path"):
        parser.add_argument(required_flag, type=str, required=True)
    parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")
    logger.info(options)

    # The label mapping written during training determines which per-label
    # sentence files to evaluate.
    index_label_file_path = os.path.join(
        options.training_path, global_config.index_to_label_dict_file)
    with open(index_label_file_path, 'r') as index_label_file:
        index_label_dict = json.load(index_label_file)

    style_transfer_scores = []
    content_preservation_scores = []
    word_overlap_scores = []
    ll_scores = []
    for label_index in index_label_dict:
        actual_text_file_path = os.path.join(
            options.inference_path,
            "actual_sentences_{}.txt".format(label_index))
        generated_text_file_path = os.path.join(
            options.inference_path,
            "generated_sentences_{}.txt".format(label_index))

        transfer_score, _ = style_transfer.get_style_transfer_score(
            options.classifier_model_path, generated_text_file_path, label_index)
        preservation_score, overlap_score = \
            content_preservation.run_content_preservation_evaluator(
                actual_text_file_path, generated_text_file_path,
                options.embeddings_path)
        likelihood_score = language_model_evaluator.score_generated_sentences(
            generated_text_file_path, options.language_model_path)

        style_transfer_scores.append(transfer_score)
        content_preservation_scores.append(preservation_score)
        word_overlap_scores.append(overlap_score)
        ll_scores.append(likelihood_score)

    logger.info("style_transfer_scores: {}".format(style_transfer_scores))
    logger.info(
        "content_preservation_scores: {}".format(content_preservation_scores))
    logger.info("word_overlap_scores: {}".format(word_overlap_scores))
    logger.info("ll_scores: {}".format(ll_scores))
    logger.info("transfer-strength: {}".format(
        statistics.mean(style_transfer_scores)))
    logger.info("content-preservation: {}".format(
        statistics.mean(content_preservation_scores)))
    logger.info("word-overlap: {}".format(
        statistics.mean(word_overlap_scores)))
    logger.info("log-likelihood: {}".format(statistics.mean(ll_scores)))
def main(argv):
    """Command-line entry point: compute word statistics for a labelled corpus.

    All three flags are mandatory; the logging level is taken from the CLI
    rather than hard-coded.
    """
    options = Options()
    arg_parser = argparse.ArgumentParser()
    for flag in ("--text-file-path", "--label-file-path", "--logging-level"):
        arg_parser.add_argument(flag, type=str, required=True)
    arg_parser.parse_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(
        global_config.logger_name, options.logging_level)

    build_word_statistics(options.text_file_path, options.label_file_path)
def main(argv):
    """Command-line entry point: report classification accuracy.

    Compares a predictions file against gold labels; the saved-model path is
    optional and forwarded as-is to the evaluator.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--predictions-file-path", type=str, required=True)
    arg_parser.add_argument("--gold-labels-file-path", type=str, required=True)
    arg_parser.add_argument("--saved-model-path", type=str)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "DEBUG")

    cli_args = arg_parser.parse_args(args=argv)
    get_classification_accuracy(cli_args.predictions_file_path,
                                cli_args.gold_labels_file_path,
                                cli_args.saved_model_path)
def main(argv):
    """Command-line entry point: train a word2vec model over a text corpus."""
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--text-file-path", type=str, required=True)
    cli_parser.add_argument("--model-file-path", type=str, required=True)
    cli_parser.add_argument("--logging-level", type=str, default="INFO")
    parsed_args = vars(cli_parser.parse_args(args=argv))

    global logger
    logger = log_initializer.setup_custom_logger(
        global_config.logger_name, parsed_args['logging_level'])

    train_word2vec_model(parsed_args['text_file_path'],
                         parsed_args['model_file_path'])
    logger.info("Training Complete!")
def main(argv):
    """Command-line entry point: strip punctuation from a text file.

    Reads the source file, writes the cleaned text to the target file.
    """
    options = Options()
    cleaner_parser = argparse.ArgumentParser()
    cleaner_parser.add_argument("--source-file-path", type=str, required=True)
    cleaner_parser.add_argument("--target-file-path", type=str, required=True)
    cleaner_parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

    logger.info("Starting to clean source file")
    strip_punctuation(options.source_file_path, options.target_file_path)
    logger.info("Concluded cleaning source file")
def main(argv):
    """Command-line entry point: evaluate content preservation and word overlap
    between a source file and its style-transferred counterpart."""
    arg_parser = argparse.ArgumentParser()
    for flag in ("--embeddings-file-path", "--source-file-path",
                 "--target-file-path"):
        arg_parser.add_argument(flag, type=str, required=True)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "DEBUG")

    parsed = vars(arg_parser.parse_args(args=argv))
    preservation_score, overlap_score = run_content_preservation_evaluator(
        parsed["source_file_path"], parsed["target_file_path"],
        parsed["embeddings_file_path"])

    logger.info("Aggregate content preservation: {}".format(preservation_score))
    logger.info("Aggregate word overlap: {}".format(overlap_score))
def main(argv):
    """Command-line entry point: score generated sentences with a language model.

    NOTE(review): the --use-kenlm flag is parsed but never read here —
    presumably consumed by score_generated_sentences via the namespace or
    simply vestigial; confirm before removing.
    """
    options = Options()
    scorer_parser = argparse.ArgumentParser()
    scorer_parser.add_argument("--generated-text-file-path", type=str, required=True)
    scorer_parser.add_argument("--language-model-path", type=str, required=True)
    scorer_parser.add_argument("--use-kenlm", action="store_true", default=False)
    scorer_parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

    ll_score = score_generated_sentences(options.generated_text_file_path,
                                         options.language_model_path)
    logger.info("ll_score: {}".format(ll_score))
def main(argv):
    """Command-line entry point: train the sentiment/style classifier model.

    Parses training hyper-parameters from the CLI, ensures the classifier
    save directory exists, and delegates to train_classifier_model with the
    parsed options dict.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--text-file-path", type=str, required=True)
    parser.add_argument("--label-file-path", type=str, required=True)
    parser.add_argument("--vocab-size", type=int, default=1000)
    parser.add_argument("--training-epochs", type=int, default=10)
    parser.add_argument("--logging-level", type=str, default="INFO")
    options = vars(parser.parse_args(args=argv))

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 options['logging_level'])

    # exist_ok=True: without it a second training run crashes with
    # FileExistsError before any work is done.
    os.makedirs(global_config.classifier_save_directory, exist_ok=True)
    train_classifier_model(options)
    logger.info("Training Complete!")
def main(argv):
    """Command-line entry point: measure style transfer strength.

    Runs the saved classifier over a text file and logs the style transfer
    score along with the confusion matrix for the requested label index.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--classifier-saved-model-path", type=str)
    arg_parser.add_argument("--text-file-path", type=str, required=True)
    arg_parser.add_argument("--label-index", type=int, required=True)
    parsed_args = arg_parser.parse_args(argv)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

    # vars() lookup is equivalent to attribute access on the namespace.
    transfer_score, conf_matrix = get_style_transfer_score(
        parsed_args.classifier_saved_model_path,
        parsed_args.text_file_path,
        parsed_args.label_index)

    logger.info("style_transfer_score: {}".format(transfer_score))
    logger.info("confusion_matrix: {}".format(conf_matrix))
def main(argv):
    """Command-line entry point: plot saved style and content embeddings.

    Loads the label-name mapping plus the pickled (coordinates, markers)
    pairs from the saved-model directory and renders one plot for the style
    space and one for the content space.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--saved-model-path", type=str)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

    args = vars(arg_parser.parse_args(args=argv))
    logger.info(args)

    model_dir = args["saved_model_path"]

    with open(os.path.join(model_dir, global_config.index_to_label_dict_file),
              'r') as file:
        label_names = json.load(file)
    logger.info("label_names: {}".format(label_names))

    # Style embedding plot (plot index 0).
    # NOTE(review): pickle.load on files this pipeline wrote itself — fine as
    # long as the model directory is trusted.
    with open(os.path.join(model_dir, global_config.style_coordinates_file),
              'rb') as pickle_file:
        style_coordinates, markers = pickle.load(pickle_file)
    plot_coordinates(
        style_coordinates,
        os.path.join(model_dir, global_config.style_embedding_plot_file),
        markers, label_names, 0)

    # Content embedding plot (plot index 1).
    with open(os.path.join(model_dir, global_config.content_coordinates_file),
              'rb') as pickle_file:
        content_coordinates, markers = pickle.load(pickle_file)
    plot_coordinates(
        content_coordinates,
        os.path.join(model_dir, global_config.content_embedding_plot_file),
        markers, label_names, 1)
def main(argv):
    """Command-line entry point for the adversarial autoencoder pipeline.

    Exactly one of three mutually exclusive run modes must be chosen:
      --train-model         train and save a new model
      --transform-text      restyle evaluation text with each label's style
      --generate-novel-text sample new sentences per label from a saved model

    Mode-specific flags are registered only after the mode flag itself is
    parsed (hence the two parse_known_args passes).

    Bug fix: in generate-novel-text mode the JSON label-map keys are strings
    while --label-index is an int, so the old `label_index !=
    options.label_index` comparison was always true and every label was
    skipped; labels are now compared as ints.
    """
    options = Options()
    parser = argparse.ArgumentParser()
    parser.add_argument("--logging-level", type=str, default="INFO")
    run_mode = parser.add_mutually_exclusive_group(required=True)
    run_mode.add_argument("--train-model", action="store_true", default=False)
    run_mode.add_argument("--transform-text", action="store_true", default=False)
    run_mode.add_argument("--generate-novel-text", action="store_true", default=False)
    # First pass: discover which run mode was requested.
    parser.parse_known_args(args=argv, namespace=options)

    if options.train_model:
        parser.add_argument("--vocab-size", type=int, default=1000)
        parser.add_argument("--training-epochs", type=int, default=10)
        parser.add_argument("--text-file-path", type=str, required=True)
        parser.add_argument("--label-file-path", type=str, required=True)
        parser.add_argument("--validation-text-file-path", type=str, required=True)
        parser.add_argument("--validation-label-file-path", type=str, required=True)
        parser.add_argument("--training-embeddings-file-path", type=str)
        parser.add_argument("--validation-embeddings-file-path", type=str, required=True)
        parser.add_argument("--dump-embeddings", action="store_true", default=False)
        parser.add_argument("--classifier-saved-model-path", type=str, required=True)
    if options.transform_text:
        parser.add_argument("--saved-model-path", type=str, required=True)
        parser.add_argument("--evaluation-text-file-path", type=str, required=True)
        parser.add_argument("--evaluation-label-file-path", type=str, required=True)
    if options.generate_novel_text:
        parser.add_argument("--saved-model-path", type=str, required=True)
        parser.add_argument("--num-sentences-to-generate", type=int,
                            default=1000, required=True)
        # NOTE(review): default=1000 with a label filter below means that if
        # the flag is omitted, only a (likely nonexistent) label 1000 would be
        # generated — preserved for interface compatibility; confirm intent.
        parser.add_argument("--label-index", type=int, default=1000, required=False)
    # Second pass: parse the mode-specific flags registered above.
    parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 options.logging_level)

    if not (options.train_model or options.transform_text
            or options.generate_novel_text):
        logger.info("Nothing to do. Exiting ...")
        sys.exit(0)

    global_config.training_epochs = options.training_epochs

    logger.info("experiment_timestamp: {}".format(
        global_config.experiment_timestamp))

    # Train and save model
    if options.train_model:
        os.makedirs(global_config.save_directory)
        with open(global_config.model_config_file_path, 'w') as model_config_file:
            json.dump(obj=mconf.__dict__, fp=model_config_file, indent=4)
        logger.info("Saved model config to {}".format(
            global_config.model_config_file_path))

        # Retrieve all data
        logger.info("Reading data ...")
        [word_index, padded_sequences, text_sequence_lengths, one_hot_labels,
         num_labels, text_tokenizer, inverse_word_index] = get_data(options)
        data_size = padded_sequences.shape[0]

        encoder_embedding_matrix, decoder_embedding_matrix = \
            get_word_embeddings(options.training_embeddings_file_path, word_index)

        # Build model
        logger.info("Building model architecture ...")
        network = adversarial_autoencoder.AdversarialAutoencoder()
        network.build_model(word_index, encoder_embedding_matrix,
                            decoder_embedding_matrix, num_labels)

        logger.info("Training model ...")
        sess = tf_session_helper.get_tensorflow_session()

        [_, validation_actual_word_lists, validation_sequences,
         validation_sequence_lengths] = \
            data_processor.get_test_sequences(
                options.validation_text_file_path, text_tokenizer, word_index,
                inverse_word_index)
        [_, validation_labels] = \
            data_processor.get_test_labels(options.validation_label_file_path,
                                           global_config.save_directory)

        network.train(sess, data_size, padded_sequences, text_sequence_lengths,
                      one_hot_labels, num_labels, word_index,
                      encoder_embedding_matrix, decoder_embedding_matrix,
                      validation_sequences, validation_sequence_lengths,
                      validation_labels, inverse_word_index,
                      validation_actual_word_lists, options)
        sess.close()
        logger.info("Training complete!")

    elif options.transform_text:
        # Enforce a particular style embedding and regenerate text
        logger.info("Transforming text style ...")

        with open(os.path.join(options.saved_model_path,
                               global_config.model_config_file), 'r') as json_file:
            model_config_dict = json.load(json_file)
            mconf.init_from_dict(model_config_dict)
            logger.info("Restored model config from saved JSON")

        with open(os.path.join(options.saved_model_path,
                               global_config.vocab_save_file), 'r') as json_file:
            word_index = json.load(json_file)
        with open(os.path.join(options.saved_model_path,
                               global_config.index_to_label_dict_file), 'r') as json_file:
            index_to_label_map = json.load(json_file)
        with open(os.path.join(options.saved_model_path,
                               global_config.average_label_embeddings_file),
                  'rb') as pickle_file:
            average_label_embeddings = pickle.load(pickle_file)

        global_config.vocab_size = len(word_index)
        num_labels = len(index_to_label_map)

        text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=global_config.vocab_size,
            filters=global_config.tokenizer_filters)
        text_tokenizer.word_index = word_index
        inverse_word_index = {v: k for k, v in word_index.items()}

        [actual_sequences, _, padded_sequences, text_sequence_lengths] = \
            data_processor.get_test_sequences(
                options.evaluation_text_file_path, text_tokenizer, word_index,
                inverse_word_index)
        [label_sequences, _] = \
            data_processor.get_test_labels(options.evaluation_label_file_path,
                                           options.saved_model_path)

        logger.info("Building model architecture ...")
        network = adversarial_autoencoder.AdversarialAutoencoder()
        encoder_embedding_matrix, decoder_embedding_matrix = get_word_embeddings(
            None, word_index)
        network.build_model(word_index, encoder_embedding_matrix,
                            decoder_embedding_matrix, num_labels)

        sess = tf_session_helper.get_tensorflow_session()
        total_nll = 0
        for i in range(num_labels):
            logger.info("Style chosen: {}".format(i))

            # Only restyle sentences whose gold label differs from the target
            # style i.
            filtered_actual_sequences = list()
            filtered_padded_sequences = list()
            filtered_text_sequence_lengths = list()
            for k in range(len(actual_sequences)):
                if label_sequences[k] != i:
                    filtered_actual_sequences.append(actual_sequences[k])
                    filtered_padded_sequences.append(padded_sequences[k])
                    filtered_text_sequence_lengths.append(
                        text_sequence_lengths[k])

            style_embedding = np.asarray(average_label_embeddings[i])

            [generated_sequences, final_sequence_lengths, _, _, _,
             cross_entropy_scores] = \
                network.transform_sentences(
                    sess, filtered_padded_sequences,
                    filtered_text_sequence_lengths, style_embedding, num_labels,
                    os.path.join(options.saved_model_path,
                                 global_config.model_save_file))
            nll = -np.mean(a=cross_entropy_scores, axis=0)
            total_nll += nll
            logger.info("NLL: {}".format(nll))

            actual_word_lists = \
                [data_processor.generate_words_from_indices(x, inverse_word_index)
                 for x in filtered_actual_sequences]

            execute_post_inference_operations(
                actual_word_lists, generated_sequences, final_sequence_lengths,
                inverse_word_index, global_config.experiment_timestamp, i)

            logger.info("Generation complete for label {}".format(i))
        logger.info("Mean NLL: {}".format(total_nll / num_labels))

        logger.info("Predicting labels from latent spaces ...")
        _, _, overall_label_predictions, style_label_predictions, \
            adversarial_label_predictions, _ = \
            network.transform_sentences(
                sess, padded_sequences, text_sequence_lengths,
                average_label_embeddings[0], num_labels,
                os.path.join(options.saved_model_path,
                             global_config.model_save_file))

        # write label predictions to file
        output_file_path = "output/{}-inference/overall_labels_prediction.txt".format(
            global_config.experiment_timestamp)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as output_file:
            for one_hot_label in overall_label_predictions:
                output_file.write("{}\n".format(
                    one_hot_label.tolist().index(1)))

        output_file_path = "output/{}-inference/style_labels_prediction.txt".format(
            global_config.experiment_timestamp)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as output_file:
            for one_hot_label in style_label_predictions:
                output_file.write("{}\n".format(
                    one_hot_label.tolist().index(1)))

        output_file_path = "output/{}-inference/adversarial_labels_prediction.txt".format(
            global_config.experiment_timestamp)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as output_file:
            for one_hot_label in adversarial_label_predictions:
                output_file.write("{}\n".format(
                    one_hot_label.tolist().index(1)))

        logger.info("Inference run complete")
        sess.close()

    elif options.generate_novel_text:
        logger.info("Generating novel text")
        with open(os.path.join(options.saved_model_path,
                               global_config.model_config_file), 'r') as json_file:
            model_config_dict = json.load(json_file)
            mconf.init_from_dict(model_config_dict)
            logger.info("Restored model config from saved JSON")

        with open(os.path.join(options.saved_model_path,
                               global_config.vocab_save_file), 'r') as json_file:
            word_index = json.load(json_file)
        with open(os.path.join(options.saved_model_path,
                               global_config.index_to_label_dict_file), 'r') as json_file:
            index_to_label_map = json.load(json_file)
        with open(os.path.join(options.saved_model_path,
                               global_config.average_label_embeddings_file),
                  'rb') as pickle_file:
            average_label_embeddings = pickle.load(pickle_file)

        global_config.vocab_size = len(word_index)
        inverse_word_index = {v: k for k, v in word_index.items()}
        num_labels = len(index_to_label_map)

        text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=global_config.vocab_size,
            filters=global_config.tokenizer_filters)
        text_tokenizer.word_index = word_index
        data_processor.populate_word_blacklist(word_index)

        logger.info("Building model architecture ...")
        network = adversarial_autoencoder.AdversarialAutoencoder()
        encoder_embedding_matrix, decoder_embedding_matrix = get_word_embeddings(
            None, word_index)
        network.build_model(word_index, encoder_embedding_matrix,
                            decoder_embedding_matrix, num_labels)

        sess = tf_session_helper.get_tensorflow_session()
        for label_index in index_to_label_map:
            # FIX: JSON object keys are always strings, while --label-index is
            # an int; the old `label_index != options.label_index` comparison
            # was therefore always true and skipped every label. Compare the
            # numeric values instead.
            if options.label_index and int(label_index) != options.label_index:
                continue
            style_embedding = np.asarray(
                average_label_embeddings[int(label_index)])

            generated_sequences, final_sequence_lengths = \
                network.generate_novel_sentences(
                    sess, style_embedding, options.num_sentences_to_generate,
                    num_labels,
                    os.path.join(options.saved_model_path,
                                 global_config.model_save_file))

            # first trims the generated sentences down to the length the
            # decoder returns, then trims any <eos> token
            trimmed_generated_sequences = \
                [[index for index in sequence
                  if index != global_config.predefined_word_index[global_config.eos_token]]
                 for sequence in [x[:(y - 1)] for (x, y) in
                                  zip(generated_sequences, final_sequence_lengths)]]

            generated_word_lists = \
                [data_processor.generate_words_from_indices(x, inverse_word_index)
                 for x in trimmed_generated_sequences]

            generated_sentences = [" ".join(x) for x in generated_word_lists]

            output_file_path = "output/{}-generation/generated_sentences_{}.txt".format(
                global_config.experiment_timestamp, label_index)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
            with open(output_file_path, 'w') as output_file:
                for sentence in generated_sentences:
                    output_file.write(sentence + "\n")

            logger.info("Generated {} sentences of label {} at path {}".format(
                options.num_sentences_to_generate,
                index_to_label_map[label_index], output_file_path))
        sess.close()
        logger.info("Generation run complete")
# Module-level configuration for splitting the lyrics corpus into
# train/validation/test partitions of text, artist, and genre files.
import json
import random
import re

from linguistic_style_transfer_model.config import global_config
from linguistic_style_transfer_model.utils import log_initializer

# Shared logger for this script, configured at import time.
logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

# Raw input corpus and the artist-to-genre mapping used for labelling.
raw_lyrics_file_path = "data/lyrics/artist-song-line.top30artists.txt"
genre_mapping_file_path = "data/lyrics/artist-genres.json"

# Validation split output files.
val_text_file_path = "data/lyrics/lyrics-val.txt"
val_artists_file_path = "data/lyrics/artist-val.txt"
val_genres_file_path = "data/lyrics/genre-val.txt"

# Test split output files.
test_text_file_path = "data/lyrics/lyrics-test.txt"
test_artists_file_path = "data/lyrics/artist-test.txt"
test_genres_file_path = "data/lyrics/genre-test.txt"

# Training split output files.
train_text_file_path = "data/lyrics/lyrics-train.txt"
train_artists_file_path = "data/lyrics/artist-train.txt"
train_genres_file_path = "data/lyrics/genre-train.txt"

# Combined (unsplit) output files.
all_text_file_path = "data/lyrics/lyrics-all.txt"
all_artists_file_path = "data/lyrics/artist-all.txt"
all_genres_file_path = "data/lyrics/genre-all.txt"

# Fractions of the corpus held out for dev and test respectively.
dev_proportion = 0.01
test_proportion = 0.05