def main(argv):
    """Aggregate evaluation metrics across all style labels of an inference run.

    For every label listed in the training run's index-to-label dictionary,
    computes style-transfer accuracy, content preservation, word overlap and
    language-model log-likelihood, then logs per-label lists and their means.
    """
    options = Options()
    parser = argparse.ArgumentParser()
    for flag in ("--classifier-model-path", "--training-path",
                 "--inference-path", "--embeddings-path",
                 "--language-model-path"):
        parser.add_argument(flag, type=str, required=True)
    parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 "INFO")
    logger.info(options)

    # Label-index dictionary written out during training.
    index_label_file_path = os.path.join(
        options.training_path, global_config.index_to_label_dict_file)
    with open(index_label_file_path, 'r') as index_label_file:
        index_label_dict = json.load(index_label_file)

    style_transfer_scores = []
    content_preservation_scores = []
    word_overlap_scores = []
    ll_scores = []

    for label_index in index_label_dict:
        actual_path = os.path.join(
            options.inference_path,
            "actual_sentences_{}.txt".format(label_index))
        generated_path = os.path.join(
            options.inference_path,
            "generated_sentences_{}.txt".format(label_index))

        transfer_score, _ = style_transfer.get_style_transfer_score(
            options.classifier_model_path, generated_path, label_index)
        preservation_score, overlap_score = \
            content_preservation.run_content_preservation_evaluator(
                actual_path, generated_path, options.embeddings_path)
        likelihood = language_model_evaluator.score_generated_sentences(
            generated_path, options.language_model_path)

        style_transfer_scores.append(transfer_score)
        content_preservation_scores.append(preservation_score)
        word_overlap_scores.append(overlap_score)
        ll_scores.append(likelihood)

    logger.info("style_transfer_scores: {}".format(style_transfer_scores))
    logger.info(
        "content_preservation_scores: {}".format(content_preservation_scores))
    logger.info("word_overlap_scores: {}".format(word_overlap_scores))
    logger.info("ll_scores: {}".format(ll_scores))

    logger.info("transfer-strength: {}".format(
        statistics.mean(style_transfer_scores)))
    logger.info("content-preservation: {}".format(
        statistics.mean(content_preservation_scores)))
    logger.info("word-overlap: {}".format(
        statistics.mean(word_overlap_scores)))
    logger.info("log-likelihood: {}".format(statistics.mean(ll_scores)))
# Example #2
def main(argv):
    """Parse CLI arguments and build per-label word statistics."""
    options = Options()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text-file-path", type=str, required=True)
    arg_parser.add_argument("--label-file-path", type=str, required=True)
    arg_parser.add_argument("--logging-level", type=str, required=True)
    arg_parser.parse_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(
        global_config.logger_name, options.logging_level)
    build_word_statistics(options.text_file_path, options.label_file_path)
def main(argv):
    """Evaluate classification accuracy of predictions against gold labels."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--predictions-file-path", type=str, required=True)
    arg_parser.add_argument("--gold-labels-file-path", type=str, required=True)
    arg_parser.add_argument("--saved-model-path", type=str)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "DEBUG")

    parsed = arg_parser.parse_args(args=argv)
    get_classification_accuracy(parsed.predictions_file_path,
                                parsed.gold_labels_file_path,
                                parsed.saved_model_path)
# Example #3
def main(argv):
    """Train a word2vec embedding model from a text corpus file."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text-file-path", type=str, required=True)
    arg_parser.add_argument("--model-file-path", type=str, required=True)
    arg_parser.add_argument("--logging-level", type=str, default="INFO")

    cli_args = vars(arg_parser.parse_args(args=argv))
    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 cli_args['logging_level'])

    train_word2vec_model(cli_args['text_file_path'], cli_args['model_file_path'])

    logger.info("Training Complete!")
# Example #5
def main(argv):
    """Strip punctuation from the source file, writing the cleaned target file."""
    options = Options()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--source-file-path", type=str, required=True)
    arg_parser.add_argument("--target-file-path", type=str, required=True)
    arg_parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

    logger.info("Starting to clean source file")
    strip_punctuation(options.source_file_path, options.target_file_path)
    logger.info("Concluded cleaning source file")
def main(argv):
    """Compute content preservation and word overlap between two sentence files."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--embeddings-file-path", type=str, required=True)
    arg_parser.add_argument("--source-file-path", type=str, required=True)
    arg_parser.add_argument("--target-file-path", type=str, required=True)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name, "DEBUG")

    cli_args = vars(arg_parser.parse_args(args=argv))
    content_preservation_score, word_overlap_score = \
        run_content_preservation_evaluator(cli_args["source_file_path"],
                                           cli_args["target_file_path"],
                                           cli_args["embeddings_file_path"])

    logger.info("Aggregate content preservation: {}".format(content_preservation_score))
    logger.info("Aggregate word overlap: {}".format(word_overlap_score))
# Example #7
def main(argv):
    """Score generated sentences under a trained language model and log the result."""
    options = Options()
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--generated-text-file-path", type=str, required=True)
    arg_parser.add_argument("--language-model-path", type=str, required=True)
    # NOTE(review): this flag is parsed but never consumed in this function —
    # confirm whether score_generated_sentences should receive it.
    arg_parser.add_argument("--use-kenlm", action="store_true", default=False)

    arg_parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 "INFO")

    ll_score = score_generated_sentences(options.generated_text_file_path,
                                         options.language_model_path)
    logger.info("ll_score: {}".format(ll_score))
def main(argv):
    """Parse training options and train the style classifier model.

    Fix: ``os.makedirs`` previously raised ``FileExistsError`` whenever the
    classifier save directory already existed (e.g. on a rerun). Passing
    ``exist_ok=True`` makes reruns idempotent and matches how every other
    ``os.makedirs`` call in this codebase is written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--text-file-path", type=str, required=True)
    parser.add_argument("--label-file-path", type=str, required=True)
    parser.add_argument("--vocab-size", type=int, default=1000)
    parser.add_argument("--training-epochs", type=int, default=10)
    parser.add_argument("--logging-level", type=str, default="INFO")

    options = vars(parser.parse_args(args=argv))
    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 options['logging_level'])

    # exist_ok=True: do not crash if the directory was created by an earlier run.
    os.makedirs(global_config.classifier_save_directory, exist_ok=True)

    train_classifier_model(options)

    logger.info("Training Complete!")
# Example #9
def main(argv):
    """Compute the style-transfer score of a text file for a target label."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--classifier-saved-model-path", type=str)
    arg_parser.add_argument("--text-file-path", type=str, required=True)
    arg_parser.add_argument("--label-index", type=int, required=True)
    cli_args = vars(arg_parser.parse_args(argv))

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 "INFO")

    style_transfer_score, confusion_matrix = get_style_transfer_score(
        cli_args['classifier_saved_model_path'],
        cli_args['text_file_path'],
        cli_args['label_index'])
    logger.info("style_transfer_score: {}".format(style_transfer_score))
    logger.info("confusion_matrix: {}".format(confusion_matrix))
def main(argv):
    """Plot the style and content embedding coordinates saved with a model."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--saved-model-path", type=str)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 "INFO")

    cli_args = vars(arg_parser.parse_args(args=argv))
    logger.info(cli_args)

    model_dir = cli_args["saved_model_path"]

    # Label names saved during training, used to annotate the plots.
    with open(os.path.join(model_dir, global_config.index_to_label_dict_file),
              'r') as label_file:
        label_names = json.load(label_file)
    logger.info("label_names: {}".format(label_names))

    # NOTE(review): pickle.load on model artifacts — only load files this
    # project produced itself; pickle is unsafe on untrusted input.
    with open(os.path.join(model_dir, global_config.style_coordinates_file),
              'rb') as pickle_file:
        style_coordinates, markers = pickle.load(pickle_file)
        plot_coordinates(
            style_coordinates,
            os.path.join(model_dir, global_config.style_embedding_plot_file),
            markers, label_names, 0)

    with open(os.path.join(model_dir, global_config.content_coordinates_file),
              'rb') as pickle_file:
        content_coordinates, markers = pickle.load(pickle_file)
        plot_coordinates(
            content_coordinates,
            os.path.join(model_dir, global_config.content_embedding_plot_file),
            markers, label_names, 1)
# Example #11
def main(argv):
    """Entry point for the adversarial-autoencoder style-transfer tool.

    Runs in exactly one of three mutually exclusive modes chosen on the
    command line:

    * ``--train-model``: read a labelled corpus, train the adversarial
      autoencoder and save model, config and vocabulary to disk.
    * ``--transform-text``: restore a saved model and regenerate each input
      sentence with a different style embedding enforced.
    * ``--generate-novel-text``: restore a saved model and sample brand-new
      sentences for each (or one chosen) style label.
    """
    options = Options()

    parser = argparse.ArgumentParser()
    parser.add_argument("--logging-level", type=str, default="INFO")
    run_mode = parser.add_mutually_exclusive_group(required=True)
    run_mode.add_argument("--train-model", action="store_true", default=False)
    run_mode.add_argument("--transform-text",
                          action="store_true",
                          default=False)
    run_mode.add_argument("--generate-novel-text",
                          action="store_true",
                          default=False)

    # Two-phase parsing: the first pass only determines the run mode so the
    # mode-specific arguments can be registered before the second pass.
    parser.parse_known_args(args=argv, namespace=options)
    if options.train_model:
        parser.add_argument("--vocab-size", type=int, default=1000)
        parser.add_argument("--training-epochs", type=int, default=10)
        parser.add_argument("--text-file-path", type=str, required=True)
        parser.add_argument("--label-file-path", type=str, required=True)
        parser.add_argument("--validation-text-file-path",
                            type=str,
                            required=True)
        parser.add_argument("--validation-label-file-path",
                            type=str,
                            required=True)
        parser.add_argument("--training-embeddings-file-path", type=str)
        parser.add_argument("--validation-embeddings-file-path",
                            type=str,
                            required=True)
        parser.add_argument("--dump-embeddings",
                            action="store_true",
                            default=False)
        parser.add_argument("--classifier-saved-model-path",
                            type=str,
                            required=True)
    if options.transform_text:
        parser.add_argument("--saved-model-path", type=str, required=True)
        parser.add_argument("--evaluation-text-file-path",
                            type=str,
                            required=True)
        parser.add_argument("--evaluation-label-file-path",
                            type=str,
                            required=True)
    if options.generate_novel_text:
        parser.add_argument("--saved-model-path", type=str, required=True)
        parser.add_argument("--num-sentences-to-generate",
                            type=int,
                            default=1000,
                            required=True)
        # NOTE(review): default=1000 looks copy-pasted from the argument
        # above — as a label index it is almost certainly wrong; confirm.
        parser.add_argument("--label-index",
                            type=int,
                            default=1000,
                            required=False)

    # Second pass: parse the full, mode-specific argument set.
    parser.parse_known_args(args=argv, namespace=options)

    global logger
    logger = log_initializer.setup_custom_logger(global_config.logger_name,
                                                 options.logging_level)

    # Defensive guard; the mutually-exclusive group is required, so in
    # practice one of the modes is always set.
    if not (options.train_model or options.transform_text
            or options.generate_novel_text):
        logger.info("Nothing to do. Exiting ...")
        sys.exit(0)

    # NOTE(review): options.training_epochs only exists in --train-model
    # mode; this line would raise AttributeError in the other modes unless
    # Options predefines it — verify against the Options class.
    global_config.training_epochs = options.training_epochs
    logger.info("experiment_timestamp: {}".format(
        global_config.experiment_timestamp))

    # Train and save model
    if options.train_model:
        os.makedirs(global_config.save_directory)
        # Persist the model hyperparameter config alongside the weights so
        # inference runs can restore it.
        with open(global_config.model_config_file_path,
                  'w') as model_config_file:
            json.dump(obj=mconf.__dict__, fp=model_config_file, indent=4)
        logger.info("Saved model config to {}".format(
            global_config.model_config_file_path))

        # Retrieve all data
        logger.info("Reading data ...")
        [
            word_index, padded_sequences, text_sequence_lengths,
            one_hot_labels, num_labels, text_tokenizer, inverse_word_index
        ] = get_data(options)
        data_size = padded_sequences.shape[0]

        encoder_embedding_matrix, decoder_embedding_matrix = \
            get_word_embeddings(options.training_embeddings_file_path, word_index)

        # Build model
        logger.info("Building model architecture ...")
        network = adversarial_autoencoder.AdversarialAutoencoder()
        network.build_model(word_index, encoder_embedding_matrix,
                            decoder_embedding_matrix, num_labels)

        logger.info("Training model ...")
        sess = tf_session_helper.get_tensorflow_session()

        # Validation data is passed into train() so it can be evaluated
        # between epochs.
        [_, validation_actual_word_lists, validation_sequences, validation_sequence_lengths] = \
            data_processor.get_test_sequences(
                options.validation_text_file_path, text_tokenizer, word_index, inverse_word_index)
        [_, validation_labels] = \
            data_processor.get_test_labels(options.validation_label_file_path, global_config.save_directory)

        network.train(sess, data_size, padded_sequences, text_sequence_lengths,
                      one_hot_labels, num_labels, word_index,
                      encoder_embedding_matrix, decoder_embedding_matrix,
                      validation_sequences, validation_sequence_lengths,
                      validation_labels, inverse_word_index,
                      validation_actual_word_lists, options)
        sess.close()

        logger.info("Training complete!")

    elif options.transform_text:
        # Enforce a particular style embedding and regenerate text
        logger.info("Transforming text style ...")

        # Restore the hyperparameter config saved at training time.
        with open(
                os.path.join(options.saved_model_path,
                             global_config.model_config_file),
                'r') as json_file:
            model_config_dict = json.load(json_file)
            mconf.init_from_dict(model_config_dict)
            logger.info("Restored model config from saved JSON")

        # Restore vocabulary, label mapping and the per-label average style
        # embeddings computed during training.
        with open(
                os.path.join(options.saved_model_path,
                             global_config.vocab_save_file), 'r') as json_file:
            word_index = json.load(json_file)
        with open(
                os.path.join(options.saved_model_path,
                             global_config.index_to_label_dict_file),
                'r') as json_file:
            index_to_label_map = json.load(json_file)
        with open(
                os.path.join(options.saved_model_path,
                             global_config.average_label_embeddings_file),
                'rb') as pickle_file:
            average_label_embeddings = pickle.load(pickle_file)

        global_config.vocab_size = len(word_index)

        # Rebuild the tokenizer with the saved vocabulary instead of fitting
        # a new one.
        num_labels = len(index_to_label_map)
        text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=global_config.vocab_size,
            filters=global_config.tokenizer_filters)
        text_tokenizer.word_index = word_index

        inverse_word_index = {v: k for k, v in word_index.items()}
        [actual_sequences, _, padded_sequences, text_sequence_lengths] = \
            data_processor.get_test_sequences(
                options.evaluation_text_file_path, text_tokenizer, word_index, inverse_word_index)
        [label_sequences, _] = \
            data_processor.get_test_labels(options.evaluation_label_file_path, options.saved_model_path)

        logger.info("Building model architecture ...")
        network = adversarial_autoencoder.AdversarialAutoencoder()
        encoder_embedding_matrix, decoder_embedding_matrix = get_word_embeddings(
            None, word_index)
        network.build_model(word_index, encoder_embedding_matrix,
                            decoder_embedding_matrix, num_labels)

        sess = tf_session_helper.get_tensorflow_session()

        total_nll = 0
        for i in range(num_labels):
            logger.info("Style chosen: {}".format(i))

            # Transform only sentences that are NOT already of style i.
            filtered_actual_sequences = list()
            filtered_padded_sequences = list()
            filtered_text_sequence_lengths = list()
            for k in range(len(actual_sequences)):
                if label_sequences[k] != i:
                    filtered_actual_sequences.append(actual_sequences[k])
                    filtered_padded_sequences.append(padded_sequences[k])
                    filtered_text_sequence_lengths.append(
                        text_sequence_lengths[k])

            # Force the average style embedding of label i onto the inputs.
            style_embedding = np.asarray(average_label_embeddings[i])
            [generated_sequences, final_sequence_lengths, _, _, _, cross_entropy_scores] = \
                network.transform_sentences(
                    sess, filtered_padded_sequences, filtered_text_sequence_lengths, style_embedding,
                    num_labels, os.path.join(options.saved_model_path, global_config.model_save_file))
            nll = -np.mean(a=cross_entropy_scores, axis=0)
            total_nll += nll
            logger.info("NLL: {}".format(nll))

            actual_word_lists = \
                [data_processor.generate_words_from_indices(x, inverse_word_index)
                 for x in filtered_actual_sequences]

            execute_post_inference_operations(
                actual_word_lists, generated_sequences, final_sequence_lengths,
                inverse_word_index, global_config.experiment_timestamp, i)

            logger.info("Generation complete for label {}".format(i))

        logger.info("Mean NLL: {}".format(total_nll / num_labels))

        # Second pass over the *unfiltered* data just to extract the
        # latent-space label predictions.
        logger.info("Predicting labels from latent spaces ...")
        _, _, overall_label_predictions, style_label_predictions, adversarial_label_predictions, _ = \
            network.transform_sentences(
                sess, padded_sequences, text_sequence_lengths, average_label_embeddings[0],
                num_labels, os.path.join(options.saved_model_path, global_config.model_save_file))

        # write label predictions to file
        # Each prediction is one-hot; the position of the 1 is the label index.
        output_file_path = "output/{}-inference/overall_labels_prediction.txt".format(
            global_config.experiment_timestamp)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as output_file:
            for one_hot_label in overall_label_predictions:
                output_file.write("{}\n".format(
                    one_hot_label.tolist().index(1)))

        output_file_path = "output/{}-inference/style_labels_prediction.txt".format(
            global_config.experiment_timestamp)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as output_file:
            for one_hot_label in style_label_predictions:
                output_file.write("{}\n".format(
                    one_hot_label.tolist().index(1)))

        output_file_path = "output/{}-inference/adversarial_labels_prediction.txt".format(
            global_config.experiment_timestamp)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as output_file:
            for one_hot_label in adversarial_label_predictions:
                output_file.write("{}\n".format(
                    one_hot_label.tolist().index(1)))

        logger.info("Inference run complete")

        sess.close()

    elif options.generate_novel_text:
        logger.info("Generating novel text")

        # Restore config, vocabulary, label map and average style embeddings
        # exactly as in the transform-text branch above.
        with open(
                os.path.join(options.saved_model_path,
                             global_config.model_config_file),
                'r') as json_file:
            model_config_dict = json.load(json_file)
            mconf.init_from_dict(model_config_dict)
            logger.info("Restored model config from saved JSON")

        with open(
                os.path.join(options.saved_model_path,
                             global_config.vocab_save_file), 'r') as json_file:
            word_index = json.load(json_file)
        with open(
                os.path.join(options.saved_model_path,
                             global_config.index_to_label_dict_file),
                'r') as json_file:
            index_to_label_map = json.load(json_file)
        with open(
                os.path.join(options.saved_model_path,
                             global_config.average_label_embeddings_file),
                'rb') as pickle_file:
            average_label_embeddings = pickle.load(pickle_file)

        global_config.vocab_size = len(word_index)
        inverse_word_index = {v: k for k, v in word_index.items()}

        num_labels = len(index_to_label_map)
        text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=global_config.vocab_size,
            filters=global_config.tokenizer_filters)
        text_tokenizer.word_index = word_index
        data_processor.populate_word_blacklist(word_index)

        logger.info("Building model architecture ...")
        network = adversarial_autoencoder.AdversarialAutoencoder()
        encoder_embedding_matrix, decoder_embedding_matrix = get_word_embeddings(
            None, word_index)
        network.build_model(word_index, encoder_embedding_matrix,
                            decoder_embedding_matrix, num_labels)

        sess = tf_session_helper.get_tensorflow_session()

        # Keys of index_to_label_map come from JSON and are therefore strings.
        for label_index in index_to_label_map:
            # NOTE(review): label_index is a str while options.label_index is
            # an int, so this inequality is always true whenever the option is
            # truthy — it looks like it skips every label instead of selecting
            # one. Confirm and compare int(label_index) instead.
            if options.label_index and label_index != options.label_index:
                continue

            style_embedding = np.asarray(
                average_label_embeddings[int(label_index)])
            generated_sequences, final_sequence_lengths = \
                network.generate_novel_sentences(
                    sess, style_embedding, options.num_sentences_to_generate, num_labels,
                    os.path.join(options.saved_model_path, global_config.model_save_file))

            # first trims the generated sentences down to the length the decoder returns
            # then trims any <eos> token
            trimmed_generated_sequences = \
                [[index for index in sequence
                  if index != global_config.predefined_word_index[global_config.eos_token]]
                 for sequence in [x[:(y - 1)] for (x, y) in zip(generated_sequences, final_sequence_lengths)]]

            generated_word_lists = \
                [data_processor.generate_words_from_indices(x, inverse_word_index)
                 for x in trimmed_generated_sequences]

            generated_sentences = [" ".join(x) for x in generated_word_lists]
            output_file_path = "output/{}-generation/generated_sentences_{}.txt".format(
                global_config.experiment_timestamp, label_index)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
            with open(output_file_path, 'w') as output_file:
                for sentence in generated_sentences:
                    output_file.write(sentence + "\n")

            logger.info("Generated {} sentences of label {} at path {}".format(
                options.num_sentences_to_generate,
                index_to_label_map[label_index], output_file_path))

        sess.close()
        logger.info("Generation run complete")
import json
import random
import re

from linguistic_style_transfer_model.config import global_config
from linguistic_style_transfer_model.utils import log_initializer

# Module-level logger for this dataset-preparation script.
logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")

# Input files: presumably the raw scraped lyrics (one record per line) and an
# artist -> genre mapping used to derive genre labels — confirm against the
# code that reads them.
raw_lyrics_file_path = "data/lyrics/artist-song-line.top30artists.txt"
genre_mapping_file_path = "data/lyrics/artist-genres.json"

# Validation split output paths (text, artist labels, genre labels).
val_text_file_path = "data/lyrics/lyrics-val.txt"
val_artists_file_path = "data/lyrics/artist-val.txt"
val_genres_file_path = "data/lyrics/genre-val.txt"

# Test split output paths.
test_text_file_path = "data/lyrics/lyrics-test.txt"
test_artists_file_path = "data/lyrics/artist-test.txt"
test_genres_file_path = "data/lyrics/genre-test.txt"

# Training split output paths.
train_text_file_path = "data/lyrics/lyrics-train.txt"
train_artists_file_path = "data/lyrics/artist-train.txt"
train_genres_file_path = "data/lyrics/genre-train.txt"

# Unsplit ("all") output paths.
all_text_file_path = "data/lyrics/lyrics-all.txt"
all_artists_file_path = "data/lyrics/artist-all.txt"
all_genres_file_path = "data/lyrics/genre-all.txt"

# Fractions of the corpus routed to the dev and test splits; the remainder
# is the training split.
dev_proportion = 0.01
test_proportion = 0.05