def evaluate(data_set_file_or_name, data_format=None, data_directory=None,
             map_features=None, feature_selection=None, example_filter=None,
             noisy_preprocessing_methods=None, preprocessing_methods=None,
             split_data_set=None, splitting_method=None,
             splitting_fraction=None, model_type=None, latent_size=None,
             hidden_sizes=None, number_of_importance_samples=None,
             number_of_monte_carlo_samples=None, inference_architecture=None,
             latent_distribution=None, number_of_classes=None,
             parameterise_latent_posterior=False,
             prior_probabilities_method=None, generative_architecture=None,
             reconstruction_distribution=None,
             number_of_reconstruction_classes=None, count_sum=None,
             proportion_of_free_nats_for_y_kl_divergence=None,
             minibatch_normalisation=None, batch_correction=None,
             dropout_keep_probabilities=None, number_of_warm_up_epochs=None,
             kl_weight=None, minibatch_size=None, run_id=None,
             models_directory=None, included_analyses=None,
             analysis_level=None, decomposition_methods=None,
             highlight_feature_indices=None, export_options=None,
             analyses_directory=None, evaluation_set_kind=None,
             sample_size=None, prediction_method=None,
             prediction_training_set_kind=None, model_versions=None,
             **keyword_arguments):
    """Evaluate model on data set.

    Loads (and optionally splits) the data set, reconstructs the model
    configuration, evaluates each requested model version on the chosen
    evaluation subset, optionally samples from the model and predicts
    labels, and runs the result analyses.

    Returns:
        int: 0 on completion (shell-style exit code).
    """
    # Fill in any unspecified options from the module-level ``defaults``
    # configuration mapping.
    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if models_directory is None:
        models_directory = defaults["models"]["directory"]
    if evaluation_set_kind is None:
        evaluation_set_kind = defaults["evaluation"]["data_set_name"]
    if sample_size is None:
        sample_size = defaults["models"]["sample_size"]
    if prediction_method is None:
        prediction_method = defaults["evaluation"]["prediction_method"]
    if prediction_training_set_kind is None:
        prediction_training_set_kind = defaults["evaluation"][
            "prediction_training_set_kind"]
    if model_versions is None:
        model_versions = defaults["evaluation"]["model_versions"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    # Normalise user-supplied strings so later kind comparisons match
    # the `.kind` attribute of data subsets.
    evaluation_set_kind = normalise_string(evaluation_set_kind)
    prediction_training_set_kind = normalise_string(
        prediction_training_set_kind)
    model_versions = parse_model_versions(model_versions)

    print(title("Data"))

    # A Bernoulli reconstruction distribution needs binary data: either
    # append a "binarise" noisy-preprocessing step, or ask the DataSet to
    # binarise values directly.
    # NOTE(review): this appends to the caller-supplied
    # ``noisy_preprocessing_methods`` list in place — callers that reuse
    # the list will observe the mutation.
    binarise_values = False
    if reconstruction_distribution == "bernoulli":
        if noisy_preprocessing_methods:
            if noisy_preprocessing_methods[-1] != "binarise":
                noisy_preprocessing_methods.append("binarise")
        else:
            binarise_values = True

    data_set = DataSet(
        data_set_file_or_name,
        data_format=data_format,
        directory=data_directory,
        map_features=map_features,
        feature_selection=feature_selection,
        example_filter=example_filter,
        preprocessing_methods=preprocessing_methods,
        binarise_values=binarise_values,
        noisy_preprocessing_methods=noisy_preprocessing_methods)

    # Load the full data set when it is needed directly (no split, or
    # evaluation on the "full" set).
    if not split_data_set or evaluation_set_kind == "full":
        data_set.load()

    if split_data_set:
        training_set, validation_set, test_set = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
        data_subsets = [data_set, training_set, validation_set, test_set]
        # Keep only the subsets needed for evaluation and for training the
        # label-prediction method; clear the rest to free memory.
        # NOTE(review): if ``evaluation_set_kind`` matches no subset kind,
        # ``evaluation_set`` stays unbound and later use raises
        # ``NameError`` — presumably prevented by CLI validation; confirm.
        for data_subset in data_subsets:
            clear_data_subset = True
            if data_subset.kind == evaluation_set_kind:
                evaluation_set = data_subset
                clear_data_subset = False
            if data_subset.kind == prediction_training_set_kind:
                prediction_training_set = data_subset
                clear_data_subset = False
            if clear_data_subset:
                data_subset.clear()
    else:
        # Unsplit: null the splitting parameters so directory paths below
        # are built without a splitting component.
        splitting_method = None
        splitting_fraction = None
        evaluation_set = data_set
        prediction_training_set = data_set

    evaluation_subset_indices = indices_for_evaluation_subset(
        evaluation_set)

    # Derive model and analyses directories from the evaluated data set
    # and the effective splitting configuration.
    models_directory = build_directory_path(
        models_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)
    analyses_directory = build_directory_path(
        analyses_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)

    print(title("Model"))

    # Infer the number of classes from the labels, excluding any
    # excluded classes, when not given explicitly.
    if number_of_classes is None:
        if evaluation_set.has_labels:
            number_of_classes = (
                evaluation_set.number_of_classes
                - evaluation_set.number_of_excluded_classes)

    model = _setup_model(
        data_set=evaluation_set,
        model_type=model_type,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_importance_samples=number_of_importance_samples,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        inference_architecture=inference_architecture,
        latent_distribution=latent_distribution,
        number_of_classes=number_of_classes,
        parameterise_latent_posterior=parameterise_latent_posterior,
        prior_probabilities_method=prior_probabilities_method,
        generative_architecture=generative_architecture,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        count_sum=count_sum,
        proportion_of_free_nats_for_y_kl_divergence=(
            proportion_of_free_nats_for_y_kl_divergence),
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        dropout_keep_probabilities=dropout_keep_probabilities,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        models_directory=models_directory)

    # Drop requested model versions that do not exist for this run.
    if ("best_model" in model_versions
            and not better_model_exists(model, run_id=run_id)):
        model_versions.remove("best_model")
    if ("early_stopping" in model_versions
            and not model_stopped_early(model, run_id=run_id)):
        model_versions.remove("early_stopping")

    print(subtitle("Analysis"))
    analyses.analyse_model(
        model=model,
        run_id=run_id,
        included_analyses=included_analyses,
        analysis_level=analysis_level,
        export_options=export_options,
        analyses_directory=analyses_directory)

    print(title("Results"))

    print("Evaluation set: {} set.".format(evaluation_set.kind))
    print("Model version{}: {}.".format(
        "" if len(model_versions) == 1 else "s",
        enumerate_strings(
            [v.replace("_", " ") for v in model_versions],
            conjunction="and")))

    if prediction_method:
        # ``prediction_specifications`` is only defined here; all later
        # uses are guarded by the same ``prediction_method`` condition.
        prediction_specifications = PredictionSpecifications(
            method=prediction_method,
            number_of_clusters=number_of_classes,
            training_set_kind=prediction_training_set.kind)
        print("Prediction method: {}.".format(
            prediction_specifications.method))
        print("Number of clusters: {}.".format(
            prediction_specifications.number_of_clusters))
        print("Prediction training set: {} set.".format(
            prediction_specifications.training_set_kind))
    print()

    # Evaluate, optionally sample from, optionally predict with, and
    # analyse each remaining model version.
    for model_version in model_versions:

        use_best_model = False
        use_early_stopping_model = False
        if model_version == "best_model":
            use_best_model = True
        elif model_version == "early_stopping":
            use_early_stopping_model = True

        print(subtitle(model_version.replace("_", " ").capitalize()))

        # Evaluation
        print(heading("{} evaluation".format(
            model_version.replace("_", "-").capitalize())))

        (transformed_evaluation_set,
         reconstructed_evaluation_set,
         latent_evaluation_sets) = model.evaluate(
            evaluation_set=evaluation_set,
            evaluation_subset_indices=evaluation_subset_indices,
            minibatch_size=minibatch_size,
            run_id=run_id,
            use_best_model=use_best_model,
            use_early_stopping_model=use_early_stopping_model,
            output_versions="all")
        print()

        # Sampling (only when a sample size was requested or defaulted)
        if sample_size:
            print(heading("{} sampling".format(
                model_version.replace("_", "-").capitalize())))

            sample_reconstruction_set, __ = model.sample(
                sample_size=sample_size,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model)
            print()
        else:
            sample_reconstruction_set = None

        # Label prediction from latent representations
        if prediction_method:
            print(heading("{} prediction".format(
                model_version.replace("_", "-").capitalize())))

            # Latent values of the prediction training set are needed to
            # fit the prediction method.
            latent_prediction_training_sets = model.evaluate(
                evaluation_set=prediction_training_set,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model,
                output_versions="latent",
                log_results=False)
            print()

            cluster_ids, predicted_labels, predicted_superset_labels = (
                predict_labels(
                    training_set=latent_prediction_training_sets["z"],
                    evaluation_set=latent_evaluation_sets["z"],
                    specifications=prediction_specifications))

            # Attach the predictions to every version of the evaluation
            # set so the analyses below can use them.
            evaluation_set_versions = [
                transformed_evaluation_set, reconstructed_evaluation_set
            ] + list(latent_evaluation_sets.values())

            for evaluation_set_version in evaluation_set_versions:
                evaluation_set_version.update_predictions(
                    prediction_specifications=prediction_specifications,
                    predicted_cluster_ids=cluster_ids,
                    predicted_labels=predicted_labels,
                    predicted_superset_labels=predicted_superset_labels)
            print()

        # Result analyses for this model version
        print(heading("{} analysis".format(
            model_version.replace("_", "-").capitalize())))

        analyses.analyse_results(
            evaluation_set=transformed_evaluation_set,
            reconstructed_evaluation_set=reconstructed_evaluation_set,
            latent_evaluation_sets=latent_evaluation_sets,
            model=model,
            run_id=run_id,
            sample_reconstruction_set=sample_reconstruction_set,
            decomposition_methods=decomposition_methods,
            evaluation_subset_indices=evaluation_subset_indices,
            highlight_feature_indices=highlight_feature_indices,
            best_model=use_best_model,
            early_stopping=use_early_stopping_model,
            included_analyses=included_analyses,
            analysis_level=analysis_level,
            export_options=export_options,
            analyses_directory=analyses_directory)

    return 0
def analyse(data_set_file_or_name, data_format=None, data_directory=None,
            map_features=None, feature_selection=None, example_filter=None,
            preprocessing_methods=None, split_data_set=None,
            splitting_method=None, splitting_fraction=None,
            included_analyses=None, analysis_level=None,
            decomposition_methods=None, highlight_feature_indices=None,
            export_options=None, analyses_directory=None,
            **keyword_arguments):
    """Analyse data set.

    Loads the data set (optionally also splitting it), then runs the
    configured data analyses on the resulting set(s).

    Returns:
        int: 0 on completion (shell-style exit code).
    """
    # Unspecified options fall back to the module-level ``defaults``
    # configuration; lookups only happen for the missing values.
    split_data_set = (
        defaults["data"]["split_data_set"]
        if split_data_set is None else split_data_set)
    splitting_method = (
        defaults["data"]["splitting_method"]
        if splitting_method is None else splitting_method)
    splitting_fraction = (
        defaults["data"]["splitting_fraction"]
        if splitting_fraction is None else splitting_fraction)
    analyses_directory = (
        defaults["analyses"]["directory"]
        if analyses_directory is None else analyses_directory)

    print(title("Data"))

    loading_options = dict(
        data_format=data_format,
        directory=data_directory,
        map_features=map_features,
        feature_selection=feature_selection,
        example_filter=example_filter,
        preprocessing_methods=preprocessing_methods)
    data_set = DataSet(data_set_file_or_name, **loading_options)
    data_set.load()

    # The full set is always analysed; the split subsets are appended
    # when splitting is requested.  Without a split, the splitting
    # parameters are nulled so the analyses directory is built without a
    # splitting component.
    data_sets_for_analysis = [data_set]
    if split_data_set:
        data_sets_for_analysis.extend(data_set.split(
            method=splitting_method, fraction=splitting_fraction))
    else:
        splitting_method = None
        splitting_fraction = None

    analyses_directory = build_directory_path(
        analyses_directory,
        data_set=data_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction,
        preprocessing=False)

    print(subtitle("Analysing data"))
    analyses.analyse_data(
        data_sets=data_sets_for_analysis,
        decomposition_methods=decomposition_methods,
        highlight_feature_indices=highlight_feature_indices,
        included_analyses=included_analyses,
        analysis_level=analysis_level,
        export_options=export_options,
        analyses_directory=analyses_directory)

    return 0
def train(data_set_file_or_name, data_format=None, data_directory=None,
          map_features=None, feature_selection=None, example_filter=None,
          noisy_preprocessing_methods=None, preprocessing_methods=None,
          split_data_set=None, splitting_method=None,
          splitting_fraction=None, model_type=None, latent_size=None,
          hidden_sizes=None, number_of_importance_samples=None,
          number_of_monte_carlo_samples=None, inference_architecture=None,
          latent_distribution=None, number_of_classes=None,
          parameterise_latent_posterior=False,
          prior_probabilities_method=None, generative_architecture=None,
          reconstruction_distribution=None,
          number_of_reconstruction_classes=None, count_sum=None,
          proportion_of_free_nats_for_y_kl_divergence=None,
          minibatch_normalisation=None, batch_correction=None,
          dropout_keep_probabilities=None, number_of_warm_up_epochs=None,
          kl_weight=None, number_of_epochs=None, minibatch_size=None,
          learning_rate=None, run_id=None, new_run=False,
          reset_training=None, models_directory=None,
          caches_directory=None, analyses_directory=None,
          **keyword_arguments):
    """Train model on data set.

    Loads (and optionally splits) the data set, builds the model from
    the given configuration, and trains it on the training set,
    optionally running intermediate analyses during training.

    Returns:
        int: 0 on completion (shell-style exit code).
    """
    # Fill in unspecified options from the module-level ``defaults``
    # configuration mapping.
    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if models_directory is None:
        models_directory = defaults["models"]["directory"]

    print(title("Data"))

    # A Bernoulli reconstruction distribution needs binary data: either
    # append a "binarise" noisy-preprocessing step, or ask the DataSet to
    # binarise values directly.
    # NOTE(review): this appends to the caller-supplied
    # ``noisy_preprocessing_methods`` list in place — callers that reuse
    # the list will observe the mutation.
    binarise_values = False
    if reconstruction_distribution == "bernoulli":
        if noisy_preprocessing_methods:
            if noisy_preprocessing_methods[-1] != "binarise":
                noisy_preprocessing_methods.append("binarise")
        else:
            binarise_values = True

    data_set = DataSet(
        data_set_file_or_name,
        data_format=data_format,
        directory=data_directory,
        map_features=map_features,
        feature_selection=feature_selection,
        example_filter=example_filter,
        preprocessing_methods=preprocessing_methods,
        binarise_values=binarise_values,
        noisy_preprocessing_methods=noisy_preprocessing_methods)

    if split_data_set:
        # Test set is not needed for training; discard it.
        training_set, validation_set, __ = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
    else:
        # Unsplit: train on the full set with no validation set, and
        # null the splitting parameters so directory paths below are
        # built without a splitting component.
        data_set.load()
        splitting_method = None
        splitting_fraction = None
        training_set = data_set
        validation_set = None

    # Derive output directories from the data set and the effective
    # splitting configuration.
    models_directory = build_directory_path(
        models_directory,
        data_set=data_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)
    if analyses_directory:
        analyses_directory = build_directory_path(
            analyses_directory,
            data_set=data_set,
            splitting_method=splitting_method,
            splitting_fraction=splitting_fraction)

    # Optional temporary directory for training logs, mirroring the
    # models-directory layout under ``caches_directory``.
    model_caches_directory = None
    if caches_directory:
        model_caches_directory = os.path.join(caches_directory, "log")
        model_caches_directory = build_directory_path(
            model_caches_directory,
            data_set=data_set,
            splitting_method=splitting_method,
            splitting_fraction=splitting_fraction)

    print(title("Model"))

    # Infer the number of classes from the labels, excluding any
    # excluded classes, when not given explicitly.
    if number_of_classes is None:
        if training_set.has_labels:
            number_of_classes = (
                training_set.number_of_classes
                - training_set.number_of_excluded_classes)

    model = _setup_model(
        data_set=training_set,
        model_type=model_type,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_importance_samples=number_of_importance_samples,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        inference_architecture=inference_architecture,
        latent_distribution=latent_distribution,
        number_of_classes=number_of_classes,
        parameterise_latent_posterior=parameterise_latent_posterior,
        prior_probabilities_method=prior_probabilities_method,
        generative_architecture=generative_architecture,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        count_sum=count_sum,
        proportion_of_free_nats_for_y_kl_divergence=(
            proportion_of_free_nats_for_y_kl_divergence),
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        dropout_keep_probabilities=dropout_keep_probabilities,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        models_directory=models_directory)

    print(model.description)
    print()

    print(model.parameters)
    print()

    print(subtitle("Training"))

    # Intermediate analyses are only run when an analyses directory was
    # supplied.
    if analyses_directory:
        intermediate_analyser = analyses.analyse_intermediate_results
    else:
        intermediate_analyser = None

    model.train(
        training_set,
        validation_set,
        number_of_epochs=number_of_epochs,
        minibatch_size=minibatch_size,
        learning_rate=learning_rate,
        intermediate_analyser=intermediate_analyser,
        run_id=run_id,
        new_run=new_run,
        reset_training=reset_training,
        analyses_directory=analyses_directory,
        temporary_log_directory=model_caches_directory)

    # Remove temporary directories created and emptied during training
    if model_caches_directory and os.path.exists(caches_directory):
        remove_empty_directories(caches_directory)

    return 0