def main(data_set_name, data_directory, log_directory, results_directory, splitting_method, splitting_fraction, latent_size, hidden_sizes, reconstruction_distribution, number_of_reconstruction_classes, number_of_warm_up_epochs, number_of_epochs, batch_size, learning_rate, reset_training): # Data data_set = data.DataSet(data_set_name, data_directory) training_set, validation_set, test_set = data_set.split( splitting_method, splitting_fraction) feature_size = data_set.number_of_features # Modeling model = modeling.VariationalAutoEncoder( feature_size, latent_size, hidden_sizes, reconstruction_distribution, number_of_reconstruction_classes, ) log_directory = os.path.join(log_directory, data_set_name, model.name) model.train(training_set, validation_set, number_of_epochs, batch_size, learning_rate, # number_of_warm_up_epochs, log_directory, reset_training) # Analysis results_directory = os.path.join(results_directory, data_set_name, model.name) analysis.analyseModel(log_directory, results_directory) reconstructed_test_set, latent_set, test_metrics = model.evaluate(test_set, batch_size, log_directory) analysis.analyseResults(test_set, reconstructed_test_set, latent_set, results_directory)
def main(input_file_or_name, data_directory = "data", log_directory = "log", results_directory = "results", temporary_log_directory = None, map_features = False, feature_selection = [], example_filter = [], preprocessing_methods = [], noisy_preprocessing_methods = [], split_data_set = True, splitting_method = "default", splitting_fraction = 0.9, model_type = "VAE", latent_size = 50, hidden_sizes = [500], number_of_importance_samples = [5], number_of_monte_carlo_samples = [10], inference_architecture = "MLP", latent_distribution = "gaussian", number_of_classes = None, parameterise_latent_posterior = False, generative_architecture = "MLP", reconstruction_distribution = "poisson", number_of_reconstruction_classes = 0, prior_probabilities_method = "uniform", number_of_warm_up_epochs = 0, kl_weight = 1, proportion_of_free_KL_nats = 0.0, batch_normalisation = True, dropout_keep_probabilities = [], count_sum = True, number_of_epochs = 200, plotting_interval_during_training = None, batch_size = 100, learning_rate = 1e-4, run_id = None, new_run = False, prediction_method = None, prediction_training_set_name = "training", prediction_decomposition_method = None, prediction_decomposition_dimensionality = None, decomposition_methods = ["PCA"], highlight_feature_indices = [], reset_training = False, skip_modelling = False, model_versions = ["all"], analyse = True, evaluation_set_name = "test", analyse_data = False, analyses = ["default"], analysis_level = "normal", fast_analysis = False, export_options = []): # Setup model_versions = parseModelVersions(model_versions) ## Analyses if fast_analysis: analyse = True analyses = ["simple"] analysis_level = "limited" ## Distributions reconstruction_distribution = parseDistribution( reconstruction_distribution) latent_distribution = parseDistribution(latent_distribution) ## Model configuration validation if not skip_modelling: if run_id: run_id = checkRunID(run_id) model_valid, model_errors = validateModelParameters( model_type, latent_distribution, reconstruction_distribution, number_of_reconstruction_classes, parameterise_latent_posterior ) if not model_valid: print("Model configuration is invalid:") for model_error in model_errors: print(" ", model_error) print() if analyse_data: print("Skipping modelling.") print("") skip_modelling = True else: print("Modelling cancelled.") return ## Binarisation binarise_values = False if reconstruction_distribution == "bernoulli": if noisy_preprocessing_methods: if noisy_preprocessing_methods[-1] != "binarise": noisy_preprocessing_methods.append("binarise") print("Appended binarisation method to noisy preprocessing,", "because of the Bernoulli distribution.\n") else: binarise_values = True ## Data sets if not split_data_set or analyse_data or evaluation_set_name == "full" \ or prediction_training_set_name == "full": full_data_set_needed = True else: full_data_set_needed = False # Data print(title("Data")) data_set = data.DataSet( input_file_or_name, directory = data_directory, map_features = map_features, feature_selection = feature_selection, example_filter = example_filter, preprocessing_methods = preprocessing_methods, binarise_values = binarise_values, noisy_preprocessing_methods = noisy_preprocessing_methods ) if full_data_set_needed: data_set.load() if split_data_set: training_set, validation_set, test_set = data_set.split( splitting_method, splitting_fraction) all_data_sets = [data_set, training_set, validation_set, test_set] else: splitting_method = None training_set = data_set validation_set = None test_set = data_set all_data_sets = [data_set] evaluation_set_name = "full" prediction_training_set_name = "full" ## Setup of log and results directories log_directory = data.directory(log_directory, data_set, splitting_method, splitting_fraction) data_results_directory = data.directory(results_directory, data_set, splitting_method, splitting_fraction, preprocessing = False) results_directory = data.directory(results_directory, data_set, splitting_method, splitting_fraction) if temporary_log_directory: main_temporary_log_directory = temporary_log_directory temporary_log_directory = data.directory(temporary_log_directory, data_set, splitting_method, splitting_fraction) ## Data analysis if analyse and analyse_data: print(subtitle("Analysing data")) analysis.analyseData( data_sets = all_data_sets, decomposition_methods = decomposition_methods, highlight_feature_indices = highlight_feature_indices, analyses = analyses, analysis_level = analysis_level, export_options = export_options, results_directory = data_results_directory ) print() ## Full data set clean up if not full_data_set_needed: data_set.clear() # Modelling if skip_modelling: print("Modelling skipped.") return print(title("Modelling")) # Set the number of features for the model feature_size = training_set.number_of_features # Parse numbers of samples number_of_monte_carlo_samples = parseSampleLists( number_of_monte_carlo_samples) number_of_importance_samples = parseSampleLists( number_of_importance_samples) # Use analytical KL term for single-Gaussian-VAE if "VAE" in model_type: if latent_distribution == "gaussian": analytical_kl_term = True else: analytical_kl_term = False # Change latent distribution to Gaussian mixture if not already set if model_type == "GMVAE" and latent_distribution != "gaussian mixture": latent_distribution = "gaussian mixture" print("The latent distribution was changed to", "a Gaussian-mixture model, because of the model chosen.\n") # Set the number of classes if not already set if not number_of_classes: if training_set.has_labels: number_of_classes = training_set.number_of_classes \ - training_set.number_of_excluded_classes elif "mixture" in latent_distribution: raise ValueError( "For a mixture model and a data set without labels, " "the number of classes has to be set." ) else: number_of_classes = 1 print(subtitle("Model setup")) if model_type == "VAE": model = VariationalAutoencoder( feature_size = feature_size, latent_size = latent_size, hidden_sizes = hidden_sizes, number_of_monte_carlo_samples =number_of_monte_carlo_samples, number_of_importance_samples = number_of_importance_samples, analytical_kl_term = analytical_kl_term, inference_architecture = inference_architecture, latent_distribution = latent_distribution, number_of_latent_clusters = number_of_classes, parameterise_latent_posterior = parameterise_latent_posterior, generative_architecture = generative_architecture, reconstruction_distribution = reconstruction_distribution, number_of_reconstruction_classes = number_of_reconstruction_classes, batch_normalisation = batch_normalisation, dropout_keep_probabilities = dropout_keep_probabilities, count_sum = count_sum, number_of_warm_up_epochs = number_of_warm_up_epochs, kl_weight = kl_weight, log_directory = log_directory, results_directory = results_directory ) elif model_type == "GMVAE": if prior_probabilities_method == "uniform": prior_probabilities = None elif prior_probabilities_method == "infer": prior_probabilities = training_set.class_probabilities elif prior_probabilities_method == "literature": prior_probabilities = training_set.literature_probabilities else: prior_probabilities = None if not prior_probabilities: prior_probabilities_method = "uniform" prior_probabilities_values = None else: prior_probabilities_values = list(prior_probabilities.values()) prior_probabilities = { "method": prior_probabilities_method, "values": prior_probabilities_values } model = GaussianMixtureVariationalAutoencoder( feature_size = feature_size, latent_size = latent_size, hidden_sizes = hidden_sizes, number_of_monte_carlo_samples = number_of_monte_carlo_samples, number_of_importance_samples = number_of_importance_samples, analytical_kl_term = analytical_kl_term, prior_probabilities = prior_probabilities, number_of_latent_clusters = number_of_classes, proportion_of_free_KL_nats = proportion_of_free_KL_nats, reconstruction_distribution = reconstruction_distribution, number_of_reconstruction_classes = number_of_reconstruction_classes, batch_normalisation = batch_normalisation, dropout_keep_probabilities = dropout_keep_probabilities, count_sum = count_sum, number_of_warm_up_epochs = number_of_warm_up_epochs, kl_weight = kl_weight, log_directory = log_directory, results_directory = results_directory ) else: raise ValueError("Model type not found: `{}`.".format(model_type)) print(model.description) print() print(model.parameters) print() ## Training print(subtitle("Model training")) status, run_id = model.train( training_set, validation_set, number_of_epochs = number_of_epochs, batch_size = batch_size, learning_rate = learning_rate, plotting_interval = plotting_interval_during_training, run_id = run_id, new_run = new_run, reset_training = reset_training, temporary_log_directory = temporary_log_directory ) # Remove temporary directories created and emptied during training if temporary_log_directory and os.path.exists(main_temporary_log_directory): removeEmptyDirectories(main_temporary_log_directory) if not status["completed"]: print(status["message"]) return status_filename = "status" if "epochs trained" in status: status_filename += "-" + status["epochs trained"] status_path = os.path.join( model.logDirectory(run_id = run_id), status_filename + ".log" ) with open(status_path, "w") as status_file: for status_field, status_value in status.items(): if status_value: status_file.write( status_field + ": " + str(status_value) + "\n" ) print() # Evaluation, prediction, and analysis ## Setup if analyse: if prediction_method: predict_labels_using_model = False elif "GM" in model.type: predict_labels_using_model = True prediction_method = "model" else: predict_labels_using_model = False else: predict_labels_using_model = False evaluation_title_parts = ["evaluation"] if analyse: if prediction_method: evaluation_title_parts.append("prediction") evaluation_title_parts.append("analysis") evaluation_title = enumerateListOfStrings(evaluation_title_parts) print(title(evaluation_title.capitalize())) ### Set selection for data_subset in all_data_sets: clear_subset = True if data_subset.kind == evaluation_set_name: evaluation_set = data_subset clear_subset = False if prediction_method \ and data_subset.kind == prediction_training_set_name: prediction_training_set = data_subset clear_subset = False if clear_subset: data_subset.clear() ### Evaluation set evaluation_subset_indices = analysis.evaluationSubsetIndices( evaluation_set) print("Evaluation set: {} set.".format(evaluation_set.kind)) ### Prediction method if prediction_method: prediction_method = properString( prediction_method, PREDICTION_METHOD_NAMES ) prediction_method_specifications = PREDICTION_METHOD_SPECIFICATIONS\ .get(prediction_method, {}) prediction_method_inference = prediction_method_specifications.get( "inference", None) prediction_method_fixed_number_of_clusters \ = prediction_method_specifications.get( "fixed number of clusters", None) prediction_method_cluster_kind = prediction_method_specifications.get( "cluster kind", None) if prediction_method_fixed_number_of_clusters: number_of_clusters = number_of_classes else: number_of_clusters = None if prediction_method_inference \ and prediction_method_inference == "transductive": prediction_training_set = None prediction_training_set_name = None else: prediction_training_set_name = prediction_training_set.kind prediction_details = { "method": prediction_method, "number_of_classes": number_of_clusters, "training_set_name": prediction_training_set_name, "decomposition_method": prediction_decomposition_method, "decomposition_dimensionality": prediction_decomposition_dimensionality } print("Prediction method: {}.".format(prediction_method)) if number_of_clusters: print("Number of clusters: {}.".format(number_of_clusters)) if prediction_training_set: print("Prediction training set: {} set.".format( prediction_training_set.kind)) prediction_id_parts = [] if prediction_decomposition_method: prediction_decomposition_method = properString( prediction_decomposition_method, DECOMPOSITION_METHOD_NAMES ) if not prediction_decomposition_dimensionality: prediction_decomposition_dimensionality \ = DEFAULT_DECOMPOSITION_DIMENSIONALITY prediction_id_parts += [ prediction_decomposition_method, prediction_decomposition_dimensionality ] prediction_details.update({ "decomposition_method": prediction_decomposition_method, "decomposition_dimensionality": prediction_decomposition_dimensionality }) print("Decomposition method before prediction: {}-d {}.".format( prediction_decomposition_dimensionality, prediction_decomposition_method )) prediction_id_parts.append(prediction_method) if number_of_clusters: prediction_id_parts.append(number_of_clusters) if prediction_training_set \ and prediction_training_set.kind != "training": prediction_id_parts.append(prediction_training_set.kind) prediction_id = "_".join(map( lambda s: normaliseString(str(s)).replace("_", ""), prediction_id_parts )) prediction_details["id"] = prediction_id else: prediction_details = {} ### Model parameter sets model_parameter_set_names = [] if "end_of_training" in model_versions: model_parameter_set_names.append("end of training") if "best_model" in model_versions \ and betterModelExists(model, run_id = run_id): model_parameter_set_names.append("best model") if "early_stopping" in model_versions \ and modelStoppedEarly(model, run_id = run_id): model_parameter_set_names.append("early stopping") print("Model parameter sets: {}.".format(enumerateListOfStrings( model_parameter_set_names))) print() ## Model analysis if analyse: print(subtitle("Model analysis")) analysis.analyseModel( model = model, run_id = run_id, analyses = analyses, analysis_level = analysis_level, export_options = export_options, results_directory = results_directory ) ## Results evaluation, prediction, and analysis for model_parameter_set_name in model_parameter_set_names: if model_parameter_set_name == "best model": use_best_model = True else: use_best_model = False if model_parameter_set_name == "early stopping": use_early_stopping_model = True else: use_early_stopping_model = False model_parameter_set_name = model_parameter_set_name.capitalize() print(subtitle(model_parameter_set_name)) # Evaluation model_parameter_set_name = model_parameter_set_name.replace(" ", "-") print(heading("{} evaluation".format(model_parameter_set_name))) if "VAE" in model.type: transformed_evaluation_set, reconstructed_evaluation_set,\ latent_evaluation_sets = model.evaluate( evaluation_set = evaluation_set, evaluation_subset_indices = evaluation_subset_indices, batch_size = batch_size, predict_labels = predict_labels_using_model, run_id = run_id, use_best_model = use_best_model, use_early_stopping_model = use_early_stopping_model ) else: transformed_evaluation_set, reconstructed_evaluation_set = \ model.evaluate( evaluation_set = evaluation_set, evaluation_subset_indices = evaluation_subset_indices, batch_size = batch_size, run_id = run_id, use_best_model = use_best_model, use_early_stopping_model = use_early_stopping_model ) latent_evaluation_sets = None print() # Prediction if analyse and "VAE" in model.type and prediction_method \ and not transformed_evaluation_set.has_predictions: print(heading("{} prediction".format(model_parameter_set_name))) latent_prediction_evaluation_set = latent_evaluation_sets["z"] if prediction_method_inference \ and prediction_method_inference == "inductive": latent_prediction_training_sets = model.evaluate( evaluation_set = prediction_training_set, batch_size = batch_size, run_id = run_id, use_best_model = use_best_model, use_early_stopping_model = use_early_stopping_model, output_versions = "latent", log_results = False ) latent_prediction_training_set \ = latent_prediction_training_sets["z"] print() else: latent_prediction_training_set = None if prediction_decomposition_method: if latent_prediction_training_set: latent_prediction_training_set, \ latent_prediction_evaluation_set \ = data.decomposeDataSubsets( latent_prediction_training_set, latent_prediction_evaluation_set, method = prediction_decomposition_method, number_of_components = prediction_decomposition_dimensionality, random = True ) else: latent_prediction_evaluation_set \ = data.decomposeDataSubsets( latent_prediction_evaluation_set, method = prediction_decomposition_method, number_of_components = prediction_decomposition_dimensionality, random = True ) print() cluster_ids, predicted_labels, predicted_superset_labels \ = predict( latent_prediction_training_set, latent_prediction_evaluation_set, prediction_method, number_of_clusters ) transformed_evaluation_set.updatePredictions( predicted_cluster_ids = cluster_ids, predicted_labels = predicted_labels, predicted_superset_labels = predicted_superset_labels ) reconstructed_evaluation_set.updatePredictions( predicted_cluster_ids = cluster_ids, predicted_labels = predicted_labels, predicted_superset_labels = predicted_superset_labels ) for variable in latent_evaluation_sets: latent_evaluation_sets[variable].updatePredictions( predicted_cluster_ids = cluster_ids, predicted_labels = predicted_labels, predicted_superset_labels = predicted_superset_labels ) print() # Analysis if analyse: print(heading("{} results analysis".format(model_parameter_set_name))) analysis.analyseResults( evaluation_set = transformed_evaluation_set, reconstructed_evaluation_set = reconstructed_evaluation_set, latent_evaluation_sets = latent_evaluation_sets, model = model, run_id = run_id, decomposition_methods = decomposition_methods, evaluation_subset_indices = evaluation_subset_indices, highlight_feature_indices = highlight_feature_indices, prediction_details = prediction_details, best_model = use_best_model, early_stopping = use_early_stopping_model, analyses = analyses, analysis_level = analysis_level, export_options = export_options, results_directory = results_directory ) # Clean up if transformed_evaluation_set.version == "original": transformed_evaluation_set.resetPredictions()
def main(data_name, cluster_name, splitting_method = "random", splitting_fraction = 0.8, filtering_method = None, feature_selection = None, feature_size = None, latent_sizes = None, hidden_structure = None, reconstruction_distributions = None, numbers_of_reconstruction_classes = [0], use_count_sum = False, numbers_of_epochs = 10, batch_size = 100, learning_rate = 1e-3, force_training = False): random.seed(42) # Data clusters = data.loadClusterData(cluster_name) (training_set, training_headers), (validation_set, validation_headers), \ (test_set, test_headers) = data.loadCountData(data_name, splitting_method, splitting_fraction, feature_selection, feature_size, filtering_method, clusters) # print("") # # data_set_base_name = data.dataSetBaseName(splitting_method, splitting_fraction, # filtering_method, feature_selection, feature_size) # # data_sets = {"training": training_set, # "validation": validation_set, # "test": test_set} # # analysis.analyseData(data_sets, name = data_set_base_name) metadata = { "filtering method": filtering_method, "splitting method": splitting_method, "splitting fraction": splitting_fraction, "feature selection": feature_selection, "feature size": training_set.shape[1], "training size": training_set.shape[0], "validation size": validation_set.shape[0], "test size": test_set.shape[0] } print("") # Loop feature_size = training_set.shape[1] if not hidden_structure: hidden_structure = [feature_size / 10] if not latent_sizes: latent_sizes = [feature_size / 100] for latent_size, reconstruction_distribution, number_of_reconstruction_classes, \ number_of_epochs in product(latent_sizes, reconstruction_distributions, \ numbers_of_reconstruction_classes, numbers_of_epochs): if reconstruction_distribution == "bernoulli": if use_count_sum: print("Can't use count sum with Bernoulli distribution.\n") continue if number_of_reconstruction_classes > 0: print("Can't use reconstruction classification with Bernoulli distribution.\n") continue if "zero_inflated" in reconstruction_distribution: if number_of_reconstruction_classes > 0: print("Can't use reconstruction classification with zero-inflated distributions.\n") continue # Model model_name = data.modelName("VAE", filtering_method, feature_selection, feature_size, splitting_method, splitting_fraction, reconstruction_distribution, number_of_reconstruction_classes, use_count_sum, latent_size, hidden_structure, learning_rate, batch_size, number_of_epochs) model = modeling.VariationalAutoEncoderForCounts(feature_size, latent_size, hidden_structure, reconstruction_distribution, number_of_reconstruction_classes, use_count_sum) previous_model_name, epochs_still_to_train = \ data.findPreviouslyTrainedModel(model_name) print("") if previous_model_name and not force_training: model.load(previous_model_name) if epochs_still_to_train > 0: print("") model.train(training_set, validation_set, N_epochs = epochs_still_to_train, batch_size = batch_size, learning_rate = learning_rate) model.save(name = model_name, metadata = metadata) else: model.train(training_set, validation_set, N_epochs = number_of_epochs, batch_size = batch_size, learning_rate = learning_rate) model.save(name = model_name, metadata = metadata) print("") # Analysis analysis.analyseModel(model, name = model_name) print("") test_set_transformed, reconstructed_test_set, latent_set, sample_set, test_metrics = \ model.evaluate(test_set) print("") analysis.analyseResults(test_set_transformed, reconstructed_test_set, test_headers, clusters, latent_set, sample_set, name = model_name, intensive_calculations = True) print("")