Example #1
def evaluate(data_set_file_or_name,
             data_format=None,
             data_directory=None,
             map_features=None,
             feature_selection=None,
             example_filter=None,
             noisy_preprocessing_methods=None,
             preprocessing_methods=None,
             split_data_set=None,
             splitting_method=None,
             splitting_fraction=None,
             model_type=None,
             latent_size=None,
             hidden_sizes=None,
             number_of_importance_samples=None,
             number_of_monte_carlo_samples=None,
             inference_architecture=None,
             latent_distribution=None,
             number_of_classes=None,
             parameterise_latent_posterior=False,
             prior_probabilities_method=None,
             generative_architecture=None,
             reconstruction_distribution=None,
             number_of_reconstruction_classes=None,
             count_sum=None,
             proportion_of_free_nats_for_y_kl_divergence=None,
             minibatch_normalisation=None,
             batch_correction=None,
             dropout_keep_probabilities=None,
             number_of_warm_up_epochs=None,
             kl_weight=None,
             minibatch_size=None,
             run_id=None,
             models_directory=None,
             included_analyses=None,
             analysis_level=None,
             decomposition_methods=None,
             highlight_feature_indices=None,
             export_options=None,
             analyses_directory=None,
             evaluation_set_kind=None,
             sample_size=None,
             prediction_method=None,
             prediction_training_set_kind=None,
             model_versions=None,
             **keyword_arguments):
    """Evaluate model on data set."""

    # Fall back to package defaults for any options not given explicitly
    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if models_directory is None:
        models_directory = defaults["models"]["directory"]
    if evaluation_set_kind is None:
        evaluation_set_kind = defaults["evaluation"]["data_set_name"]
    if sample_size is None:
        sample_size = defaults["models"]["sample_size"]
    if prediction_method is None:
        prediction_method = defaults["evaluation"]["prediction_method"]
    if prediction_training_set_kind is None:
        prediction_training_set_kind = defaults["evaluation"][
            "prediction_training_set_kind"]
    if model_versions is None:
        model_versions = defaults["evaluation"]["model_versions"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    evaluation_set_kind = normalise_string(evaluation_set_kind)
    prediction_training_set_kind = normalise_string(
        prediction_training_set_kind)
    model_versions = parse_model_versions(model_versions)

    print(title("Data"))

    # A Bernoulli reconstruction distribution requires binarised values,
    # either as a final noisy preprocessing step or when loading the data set
    binarise_values = False
    if reconstruction_distribution == "bernoulli":
        if noisy_preprocessing_methods:
            if noisy_preprocessing_methods[-1] != "binarise":
                noisy_preprocessing_methods.append("binarise")
        else:
            binarise_values = True

    data_set = DataSet(data_set_file_or_name,
                       data_format=data_format,
                       directory=data_directory,
                       map_features=map_features,
                       feature_selection=feature_selection,
                       example_filter=example_filter,
                       preprocessing_methods=preprocessing_methods,
                       binarise_values=binarise_values,
                       noisy_preprocessing_methods=noisy_preprocessing_methods)

    if not split_data_set or evaluation_set_kind == "full":
        data_set.load()

    # Split the data set and keep only the subsets needed for evaluation
    # and for training the prediction method
    if split_data_set:
        training_set, validation_set, test_set = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
        data_subsets = [data_set, training_set, validation_set, test_set]
        for data_subset in data_subsets:
            clear_data_subset = True
            if data_subset.kind == evaluation_set_kind:
                evaluation_set = data_subset
                clear_data_subset = False
            if data_subset.kind == prediction_training_set_kind:
                prediction_training_set = data_subset
                clear_data_subset = False
            if clear_data_subset:
                data_subset.clear()
    else:
        splitting_method = None
        splitting_fraction = None
        evaluation_set = data_set
        prediction_training_set = data_set

    evaluation_subset_indices = indices_for_evaluation_subset(evaluation_set)

    models_directory = build_directory_path(
        models_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)
    analyses_directory = build_directory_path(
        analyses_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)

    print(title("Model"))

    if number_of_classes is None:
        if evaluation_set.has_labels:
            number_of_classes = (evaluation_set.number_of_classes -
                                 evaluation_set.number_of_excluded_classes)

    # Set up the model from the given configuration and models directory
    model = _setup_model(
        data_set=evaluation_set,
        model_type=model_type,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_importance_samples=number_of_importance_samples,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        inference_architecture=inference_architecture,
        latent_distribution=latent_distribution,
        number_of_classes=number_of_classes,
        parameterise_latent_posterior=parameterise_latent_posterior,
        prior_probabilities_method=prior_probabilities_method,
        generative_architecture=generative_architecture,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        count_sum=count_sum,
        proportion_of_free_nats_for_y_kl_divergence=(
            proportion_of_free_nats_for_y_kl_divergence),
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        dropout_keep_probabilities=dropout_keep_probabilities,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        models_directory=models_directory)

    if ("best_model" in model_versions
            and not better_model_exists(model, run_id=run_id)):
        model_versions.remove("best_model")

    if ("early_stopping" in model_versions
            and not model_stopped_early(model, run_id=run_id)):
        model_versions.remove("early_stopping")

    print(subtitle("Analysis"))

    analyses.analyse_model(model=model,
                           run_id=run_id,
                           included_analyses=included_analyses,
                           analysis_level=analysis_level,
                           export_options=export_options,
                           analyses_directory=analyses_directory)

    print(title("Results"))

    print("Evaluation set: {} set.".format(evaluation_set.kind))
    print("Model version{}: {}.".format(
        "" if len(model_versions) == 1 else "s",
        enumerate_strings([v.replace("_", " ") for v in model_versions],
                          conjunction="and")))

    if prediction_method:
        prediction_specifications = PredictionSpecifications(
            method=prediction_method,
            number_of_clusters=number_of_classes,
            training_set_kind=prediction_training_set.kind)
        print("Prediction method: {}.".format(
            prediction_specifications.method))
        print("Number of clusters: {}.".format(
            prediction_specifications.number_of_clusters))
        print("Prediction training set: {} set.".format(
            prediction_specifications.training_set_kind))

    print()

    # Evaluate and analyse each requested model version
    for model_version in model_versions:

        use_best_model = False
        use_early_stopping_model = False
        if model_version == "best_model":
            use_best_model = True
        elif model_version == "early_stopping":
            use_early_stopping_model = True

        print(subtitle(model_version.replace("_", " ").capitalize()))

        print(
            heading("{} evaluation".format(
                model_version.replace("_", "-").capitalize())))

        (transformed_evaluation_set, reconstructed_evaluation_set,
         latent_evaluation_sets) = model.evaluate(
             evaluation_set=evaluation_set,
             evaluation_subset_indices=evaluation_subset_indices,
             minibatch_size=minibatch_size,
             run_id=run_id,
             use_best_model=use_best_model,
             use_early_stopping_model=use_early_stopping_model,
             output_versions="all")
        print()

        if sample_size:
            print(
                heading("{} sampling".format(
                    model_version.replace("_", "-").capitalize())))

            sample_reconstruction_set, __ = model.sample(
                sample_size=sample_size,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model)
            print()
        else:
            sample_reconstruction_set = None

        if prediction_method:
            print(
                heading("{} prediction".format(
                    model_version.replace("_", "-").capitalize())))

            latent_prediction_training_sets = model.evaluate(
                evaluation_set=prediction_training_set,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model,
                output_versions="latent",
                log_results=False)
            print()

            cluster_ids, predicted_labels, predicted_superset_labels = (
                predict_labels(
                    training_set=latent_prediction_training_sets["z"],
                    evaluation_set=latent_evaluation_sets["z"],
                    specifications=prediction_specifications))

            evaluation_set_versions = [
                transformed_evaluation_set, reconstructed_evaluation_set
            ] + list(latent_evaluation_sets.values())

            for evaluation_set_version in evaluation_set_versions:
                evaluation_set_version.update_predictions(
                    prediction_specifications=prediction_specifications,
                    predicted_cluster_ids=cluster_ids,
                    predicted_labels=predicted_labels,
                    predicted_superset_labels=predicted_superset_labels)
            print()

        print(
            heading("{} analysis".format(
                model_version.replace("_", "-").capitalize())))

        analyses.analyse_results(
            evaluation_set=transformed_evaluation_set,
            reconstructed_evaluation_set=reconstructed_evaluation_set,
            latent_evaluation_sets=latent_evaluation_sets,
            model=model,
            run_id=run_id,
            sample_reconstruction_set=sample_reconstruction_set,
            decomposition_methods=decomposition_methods,
            evaluation_subset_indices=evaluation_subset_indices,
            highlight_feature_indices=highlight_feature_indices,
            best_model=use_best_model,
            early_stopping=use_early_stopping_model,
            included_analyses=included_analyses,
            analysis_level=analysis_level,
            export_options=export_options,
            analyses_directory=analyses_directory)

    return 0
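
A minimal sketch of how the evaluate function above might be called, using only the keyword arguments it defines; the data file name, format, and option values are hypothetical placeholders rather than values confirmed by the surrounding package.

# Hypothetical invocation of evaluate(); file name, data format and option
# values are placeholders.
exit_code = evaluate(
    "example_data.h5",            # hypothetical data file
    data_format="hdf5",           # hypothetical format identifier
    split_data_set=True,
    evaluation_set_kind="test",
    prediction_method="kmeans",   # hypothetical prediction method name
    sample_size=100,
    minibatch_size=64,
)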
Example #2
def analyse(data_set_file_or_name,
            data_format=None,
            data_directory=None,
            map_features=None,
            feature_selection=None,
            example_filter=None,
            preprocessing_methods=None,
            split_data_set=None,
            splitting_method=None,
            splitting_fraction=None,
            included_analyses=None,
            analysis_level=None,
            decomposition_methods=None,
            highlight_feature_indices=None,
            export_options=None,
            analyses_directory=None,
            **keyword_arguments):
    """Analyse data set."""

    # Fall back to package defaults for any options not given explicitly
    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    print(title("Data"))

    data_set = DataSet(
        data_set_file_or_name,
        data_format=data_format,
        directory=data_directory,
        map_features=map_features,
        feature_selection=feature_selection,
        example_filter=example_filter,
        preprocessing_methods=preprocessing_methods,
    )
    data_set.load()

    # Analyse the data subsets as well if the data set is split
    if split_data_set:
        training_set, validation_set, test_set = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
        all_data_sets = [data_set, training_set, validation_set, test_set]
    else:
        all_data_sets = [data_set]
        splitting_method = None
        splitting_fraction = None

    analyses_directory = build_directory_path(
        analyses_directory,
        data_set=data_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction,
        preprocessing=False)

    print(subtitle("Analysing data"))

    analyses.analyse_data(data_sets=all_data_sets,
                          decomposition_methods=decomposition_methods,
                          highlight_feature_indices=highlight_feature_indices,
                          included_analyses=included_analyses,
                          analysis_level=analysis_level,
                          export_options=export_options,
                          analyses_directory=analyses_directory)

    return 0
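
A similar sketch for calling the analyse function above; again, the file name and option values are hypothetical placeholders.

# Hypothetical invocation of analyse(); file name and option values are
# placeholders.
exit_code = analyse(
    "example_data.h5",            # hypothetical data file
    data_format="hdf5",           # hypothetical format identifier
    split_data_set=True,
    analysis_level="normal",      # hypothetical analysis level
    analyses_directory="analyses",
)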
Example #3
def train(data_set_file_or_name,
          data_format=None,
          data_directory=None,
          map_features=None,
          feature_selection=None,
          example_filter=None,
          noisy_preprocessing_methods=None,
          preprocessing_methods=None,
          split_data_set=None,
          splitting_method=None,
          splitting_fraction=None,
          model_type=None,
          latent_size=None,
          hidden_sizes=None,
          number_of_importance_samples=None,
          number_of_monte_carlo_samples=None,
          inference_architecture=None,
          latent_distribution=None,
          number_of_classes=None,
          parameterise_latent_posterior=False,
          prior_probabilities_method=None,
          generative_architecture=None,
          reconstruction_distribution=None,
          number_of_reconstruction_classes=None,
          count_sum=None,
          proportion_of_free_nats_for_y_kl_divergence=None,
          minibatch_normalisation=None,
          batch_correction=None,
          dropout_keep_probabilities=None,
          number_of_warm_up_epochs=None,
          kl_weight=None,
          number_of_epochs=None,
          minibatch_size=None,
          learning_rate=None,
          run_id=None,
          new_run=False,
          reset_training=None,
          models_directory=None,
          caches_directory=None,
          analyses_directory=None,
          **keyword_arguments):
    """Train model on data set."""

    # Fall back to package defaults for any options not given explicitly
    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if models_directory is None:
        models_directory = defaults["models"]["directory"]

    print(title("Data"))

    # A Bernoulli reconstruction distribution requires binarised values,
    # either as a final noisy preprocessing step or when loading the data set
    binarise_values = False
    if reconstruction_distribution == "bernoulli":
        if noisy_preprocessing_methods:
            if noisy_preprocessing_methods[-1] != "binarise":
                noisy_preprocessing_methods.append("binarise")
        else:
            binarise_values = True

    data_set = DataSet(data_set_file_or_name,
                       data_format=data_format,
                       directory=data_directory,
                       map_features=map_features,
                       feature_selection=feature_selection,
                       example_filter=example_filter,
                       preprocessing_methods=preprocessing_methods,
                       binarise_values=binarise_values,
                       noisy_preprocessing_methods=noisy_preprocessing_methods)

    # Split the data set into training and validation sets if requested
    if split_data_set:
        training_set, validation_set, __ = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
    else:
        data_set.load()
        splitting_method = None
        splitting_fraction = None
        training_set = data_set
        validation_set = None

    models_directory = build_directory_path(
        models_directory,
        data_set=data_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)

    if analyses_directory:
        analyses_directory = build_directory_path(
            analyses_directory,
            data_set=data_set,
            splitting_method=splitting_method,
            splitting_fraction=splitting_fraction)

    # Use a temporary log directory inside the caches directory if one is given
    model_caches_directory = None
    if caches_directory:
        model_caches_directory = os.path.join(caches_directory, "log")
        model_caches_directory = build_directory_path(
            model_caches_directory,
            data_set=data_set,
            splitting_method=splitting_method,
            splitting_fraction=splitting_fraction)

    print(title("Model"))

    if number_of_classes is None:
        if training_set.has_labels:
            number_of_classes = (training_set.number_of_classes -
                                 training_set.number_of_excluded_classes)

    # Set up the model from the given configuration
    model = _setup_model(
        data_set=training_set,
        model_type=model_type,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_importance_samples=number_of_importance_samples,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        inference_architecture=inference_architecture,
        latent_distribution=latent_distribution,
        number_of_classes=number_of_classes,
        parameterise_latent_posterior=parameterise_latent_posterior,
        prior_probabilities_method=prior_probabilities_method,
        generative_architecture=generative_architecture,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        count_sum=count_sum,
        proportion_of_free_nats_for_y_kl_divergence=(
            proportion_of_free_nats_for_y_kl_divergence),
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        dropout_keep_probabilities=dropout_keep_probabilities,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        models_directory=models_directory)

    print(model.description)
    print()

    print(model.parameters)
    print()

    print(subtitle("Training"))

    if analyses_directory:
        intermediate_analyser = analyses.analyse_intermediate_results
    else:
        intermediate_analyser = None

    # Train the model, optionally analysing intermediate results
    model.train(training_set,
                validation_set,
                number_of_epochs=number_of_epochs,
                minibatch_size=minibatch_size,
                learning_rate=learning_rate,
                intermediate_analyser=intermediate_analyser,
                run_id=run_id,
                new_run=new_run,
                reset_training=reset_training,
                analyses_directory=analyses_directory,
                temporary_log_directory=model_caches_directory)

    # Remove temporary directories created and emptied during training
    if model_caches_directory and os.path.exists(caches_directory):
        remove_empty_directories(caches_directory)

    return 0
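
A minimal sketch of calling the train function above with an explicit model configuration; the data file, model type, distribution name, and training options are hypothetical placeholders and only use keyword arguments defined by the function.

# Hypothetical invocation of train(); file name, model configuration and
# training options are placeholders.
exit_code = train(
    "example_data.h5",                       # hypothetical data file
    data_format="hdf5",                      # hypothetical format identifier
    model_type="VAE",                        # hypothetical model type name
    latent_size=10,
    hidden_sizes=[100, 100],
    reconstruction_distribution="poisson",   # hypothetical distribution name
    number_of_epochs=50,
    minibatch_size=64,
    learning_rate=1e-4,
    models_directory="models",
)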