def analyse_predictions(evaluation_set, analyses_directory=None):
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    print("Saving predictions.")
    predictions_directory = os.path.join(analyses_directory, "predictions")

    table_name = "predictions"

    if evaluation_set.prediction_specifications:
        table_name += "-" + evaluation_set.prediction_specifications.name
    else:
        table_name += "-unknown_prediction_method"

    if evaluation_set.has_predicted_cluster_ids:
        saving_time_start = time()
        save_values(
            values=evaluation_set.predicted_cluster_ids,
            name="{}-predicted_cluster_ids".format(table_name),
            row_names=evaluation_set.example_names,
            column_names=["Cluster ID"],
            directory=predictions_directory)
        saving_duration = time() - saving_time_start
        print("    Predicted cluster IDs saved ({}).".format(
            format_duration(saving_duration)))

    if evaluation_set.has_predicted_labels:
        saving_time_start = time()
        save_values(
            values=evaluation_set.predicted_labels,
            name="{}-predicted_labels".format(table_name),
            row_names=evaluation_set.example_names,
            column_names=[evaluation_set.terms["class"].capitalize()],
            directory=predictions_directory)
        saving_duration = time() - saving_time_start
        print("    Predicted labels saved ({}).".format(
            format_duration(saving_duration)))

    if evaluation_set.has_predicted_superset_labels:
        saving_time_start = time()
        save_values(
            values=evaluation_set.predicted_superset_labels,
            name="{}-predicted_superset_labels".format(table_name),
            row_names=evaluation_set.example_names,
            column_names=[evaluation_set.terms["class"].capitalize()],
            directory=predictions_directory)
        saving_duration = time() - saving_time_start
        print("    Predicted superset labels saved ({}).".format(
            format_duration(saving_duration)))

    print()

def load_original_data_set(paths, data_format):
    print("Loading original data set.")
    loading_time_start = time()

    if data_format is None:
        raise ValueError("Data format not specified.")
    elif data_format.startswith("tsv"):
        data_format = "matrix_ebf"

    load = LOADERS.get(data_format)

    if load is None:
        raise ValueError(
            "Data format `{}` not recognised.".format(data_format))

    data_dictionary = load(paths=paths)

    loading_duration = time() - loading_time_start
    print("Original data set loaded ({}).".format(
        format_duration(loading_duration)))

    if not isinstance(data_dictionary["values"], scipy.sparse.csr_matrix):
        print()
        print("Converting data set value array to sparse matrix.")
        sparse_time_start = time()
        data_dictionary["values"] = scipy.sparse.csr_matrix(
            data_dictionary["values"])
        sparse_duration = time() - sparse_time_start
        print("Data set value array converted ({}).".format(
            format_duration(sparse_duration)))

    return data_dictionary

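# Illustrative sketch of a loader as it could appear in the `LOADERS`
# dispatch table used above. The "matrix_ebf" key is taken from the code;
# the body below and the `paths["values"]["full"]` layout are assumptions
# for illustration (a tab-separated examples-by-features matrix), not the
# project's actual loader.
def _load_matrix_ebf_sketch(paths):
    values = numpy.loadtxt(paths["values"]["full"], delimiter="\t")
    return {
        # `load_original_data_set` converts dense values to CSR anyway,
        # but returning a sparse matrix avoids the extra conversion step
        "values": scipy.sparse.csr_matrix(values),
        "example names": numpy.array(
            ["example {}".format(i) for i in range(values.shape[0])]),
        "feature names": numpy.array(
            ["feature {}".format(j) for j in range(values.shape[1])]),
        "labels": None,
    }
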
def load_data_dictionary(path):

    def load(tables_file, group=None):
        if not group:
            group = tables_file.root

        data_dictionary = {}

        for node in tables_file.iter_nodes(group):
            node_title = node._v_title
            if node == group:
                pass
            elif isinstance(node, tables.Group):
                if node_title.endswith("set"):
                    data_dictionary[node_title] = load(
                        tables_file, group=node)
                elif node_title.endswith("values"):
                    data_dictionary[node_title] = _load_sparse_matrix(
                        tables_file, group=node)
                elif node_title == "split indices":
                    data_dictionary[node_title] = _load_split_indices(
                        tables_file, group=node)
                elif node_title == "feature mapping":
                    data_dictionary[node_title] = _load_feature_mapping(
                        tables_file, group=node)
                else:
                    raise NotImplementedError(
                        "Loading group `{}` not implemented.".format(
                            node_title))
            elif isinstance(node, tables.Array):
                data_dictionary[node_title] = _load_array_or_other_type(node)
            else:
                raise NotImplementedError(
                    "Loading node `{}` not implemented.".format(node_title))

        return data_dictionary

    start_time = time()

    with tables.open_file(path, "r") as tables_file:
        data_dictionary = load(tables_file)

    duration = time() - start_time
    print("Data loaded ({}).".format(format_duration(duration)))

    return data_dictionary

def save_data_dictionary(data_dictionary, path):
    directory, filename = os.path.split(path)

    if not os.path.exists(directory):
        os.makedirs(directory)

    def save(data_dictionary, tables_file, group_title=None):
        if group_title:
            group = tables_file.create_group(
                "/", normalise_string(group_title), group_title)
        else:
            group = tables_file.root

        for title, value in data_dictionary.items():
            if isinstance(value, scipy.sparse.csr_matrix):
                _save_sparse_matrix(value, title, group, tables_file)
            elif isinstance(value, (numpy.ndarray, list)):
                _save_array(value, title, group, tables_file)
            elif title == "split indices":
                _save_split_indices(value, title, group, tables_file)
            elif title == "feature mapping":
                _save_feature_mapping(value, title, group, tables_file)
            elif value is None:
                _save_string(str(value), title, group, tables_file)
            elif title.endswith("set"):
                save(value, tables_file, group_title=title)
            else:
                raise NotImplementedError(
                    "Saving type {} for title \"{}\" has not been "
                    "implemented.".format(type(value), title))

    start_time = time()

    filters = tables.Filters(complib="zlib", complevel=5)

    with tables.open_file(path, "w", filters=filters) as tables_file:
        save(data_dictionary, tables_file)

    duration = time() - start_time
    print("Data saved ({}).".format(format_duration(duration)))

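# Illustrative usage of the two functions above: save a small data dictionary
# and read it back. The keys follow the conventions `save` handles (a sparse
# "values" matrix and plain arrays); the values and file path are made up.
def _example_data_dictionary_round_trip(path="cache/example.h5"):
    data_dictionary = {
        "values": scipy.sparse.csr_matrix(
            numpy.arange(6, dtype=float).reshape(2, 3)),
        "example names": numpy.array(["example 1", "example 2"]),
        "feature names": numpy.array(["f1", "f2", "f3"]),
    }
    save_data_dictionary(data_dictionary, path)
    return load_data_dictionary(path)
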
def analyse_centroid_probabilities(centroids, name=None,
                                   analysis_level=None,
                                   export_options=None,
                                   analyses_directory=None):
    if name:
        name = normalise_string(name)
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    print("Plotting centroid probabilities.")
    plot_time_start = time()

    posterior_probabilities = None
    prior_probabilities = None
    n_centroids = None

    if "posterior" in centroids and centroids["posterior"]:
        posterior_probabilities = centroids["posterior"]["probabilities"]
        n_centroids = len(posterior_probabilities)
    if "prior" in centroids and centroids["prior"]:
        prior_probabilities = centroids["prior"]["probabilities"]
        n_centroids = len(prior_probabilities)

    centroids_palette = style.darker_palette(n_centroids)
    x_label = "$k$"

    if prior_probabilities is not None:
        if posterior_probabilities is not None:
            y_label = _axis_label_for_symbol(
                symbol="\\pi",
                distribution=normalise_string("posterior"),
                suffix="^k")
            if name:
                plot_name = [name, "posterior", "prior"]
            else:
                plot_name = ["posterior", "prior"]
        else:
            y_label = _axis_label_for_symbol(
                symbol="\\pi",
                distribution=normalise_string("prior"),
                suffix="^k")
            if name:
                plot_name = [name, "prior"]
            else:
                plot_name = "prior"
    elif posterior_probabilities is not None:
        y_label = _axis_label_for_symbol(
            symbol="\\pi",
            distribution=normalise_string("posterior"),
            suffix="^k")
        if name:
            plot_name = [name, "posterior"]
        else:
            plot_name = "posterior"

    figure, figure_name = figures.plot_probabilities(
        posterior_probabilities,
        prior_probabilities,
        x_label=x_label,
        y_label=y_label,
        palette=centroids_palette,
        uniform=False,
        name=plot_name)
    figures.save_figure(
        figure=figure,
        name=figure_name,
        options=export_options,
        directory=analyses_directory)

    plot_duration = time() - plot_time_start
    print("Centroid probabilities plotted and saved ({}).".format(
        format_duration(plot_duration)))

def analyse_decompositions(data_sets, other_data_sets=None, centroids=None,
                           colouring_data_set=None, sampled_data_set=None,
                           decomposition_methods=None,
                           highlight_feature_indices=None,
                           symbol=None, title="data set", specifier=None,
                           analysis_level=None, export_options=None,
                           analyses_directory=None):
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    centroids_original = centroids

    if isinstance(data_sets, dict):
        data_sets = list(data_sets.values())
    if not isinstance(data_sets, (list, tuple)):
        data_sets = [data_sets]

    if other_data_sets is None:
        other_data_sets = [None] * len(data_sets)
    elif not isinstance(other_data_sets, (list, tuple)):
        other_data_sets = [other_data_sets]

    if len(data_sets) != len(other_data_sets):
        raise ValueError(
            "Lists of data sets and alternative data sets do not have the "
            "same length.")

    specification = None
    base_symbol = symbol
    original_title = title

    if decomposition_methods is None:
        decomposition_methods = [defaults["decomposition_method"]]
    elif not isinstance(decomposition_methods, (list, tuple)):
        decomposition_methods = [decomposition_methods]
    else:
        decomposition_methods = decomposition_methods.copy()
    decomposition_methods.insert(0, None)

    if highlight_feature_indices is None:
        highlight_feature_indices = defaults["analyses"][
            "highlight_feature_indices"]
    elif not isinstance(highlight_feature_indices, (list, tuple)):
        highlight_feature_indices = [highlight_feature_indices]
    else:
        highlight_feature_indices = highlight_feature_indices.copy()

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    for data_set, other_data_set in zip(data_sets, other_data_sets):

        if data_set.values.shape[1] <= 1:
            continue

        title = original_title
        name = normalise_string(title)

        if specifier:
            specification = specifier(data_set)

        if specification:
            name += "-" + str(specification)
            title += " for " + specification

        title += " set"

        if not colouring_data_set:
            colouring_data_set = data_set

        if data_set.version in ["z", "z1"]:
            centroids = copy.deepcopy(centroids_original)
        else:
            centroids = None

        if other_data_set:
            title = "{} set values in {}".format(
                other_data_set.version, title)
            name = other_data_set.version + "-" + name

        decompositions_directory = os.path.join(analyses_directory, name)

        for decomposition_method in decomposition_methods:

            other_values = None
            sampled_values = None

            if other_data_set:
                other_values = other_data_set.values
            if sampled_data_set:
                sampled_values = sampled_data_set.values

            if not decomposition_method:
                if data_set.number_of_features == 2:
                    values_decomposed = data_set.values
                    other_values_decomposed = other_values
                    sampled_values_decomposed = sampled_values
                    centroids_decomposed = centroids
                else:
                    continue
            else:
                decomposition_method = proper_string(
                    decomposition_method, DECOMPOSITION_METHOD_NAMES)

                values_decomposed = data_set.values
                other_values_decomposed = other_values
                sampled_values_decomposed = sampled_values
                centroids_decomposed = centroids

                other_value_sets_decomposed = {}
                if other_values is not None:
                    other_value_sets_decomposed["other"] = other_values
                if sampled_values is not None:
                    other_value_sets_decomposed["sampled"] = sampled_values
                if not other_value_sets_decomposed:
                    other_value_sets_decomposed = None

                if decomposition_method == "t-SNE":
                    if (data_set.number_of_examples
                            > MAXIMUM_NUMBER_OF_EXAMPLES_FOR_TSNE):
                        print(
                            "The number of examples for {}".format(title),
                            "is too large to decompose it",
                            "using {}. Skipping.".format(
                                decomposition_method))
                        print()
                        continue
                    elif (data_set.number_of_features
                            > MAXIMUM_NUMBER_OF_FEATURES_FOR_TSNE):
                        number_of_pca_components_before_tsne = min(
                            MAXIMUM_NUMBER_OF_PCA_COMPONENTS_BEFORE_TSNE,
                            data_set.number_of_examples - 1)
                        print(
                            "The number of features for {}".format(title),
                            "is too large to decompose it",
                            "using {} in due time.".format(
                                decomposition_method))
                        print(
                            "Decomposing {} to {} components using PCA "
                            "beforehand.".format(
                                title,
                                number_of_pca_components_before_tsne))
                        decompose_time_start = time()
                        (values_decomposed, other_value_sets_decomposed,
                         centroids_decomposed) = decompose(
                            values_decomposed,
                            other_value_sets=other_value_sets_decomposed,
                            centroids=centroids_decomposed,
                            method="pca",
                            number_of_components=(
                                number_of_pca_components_before_tsne))
                        decompose_duration = time() - decompose_time_start
                        print("{} pre-decomposed ({}).".format(
                            capitalise_string(title),
                            format_duration(decompose_duration)))
                    else:
                        if scipy.sparse.issparse(values_decomposed):
                            values_decomposed = values_decomposed.A
                        if scipy.sparse.issparse(other_values_decomposed):
                            other_values_decomposed = (
                                other_values_decomposed.A)
                        if scipy.sparse.issparse(sampled_values_decomposed):
                            sampled_values_decomposed = (
                                sampled_values_decomposed.A)

                print("Decomposing {} using {}.".format(
                    title, decomposition_method))
                decompose_time_start = time()
                (values_decomposed, other_value_sets_decomposed,
                 centroids_decomposed) = decompose(
                    values_decomposed,
                    other_value_sets=other_value_sets_decomposed,
                    centroids=centroids_decomposed,
                    method=decomposition_method,
                    number_of_components=2)
                decompose_duration = time() - decompose_time_start
                print("{} decomposed ({}).".format(
                    capitalise_string(title),
                    format_duration(decompose_duration)))
                print()

                if other_value_sets_decomposed:
                    other_values_decomposed = (
                        other_value_sets_decomposed.get("other"))
                    sampled_values_decomposed = (
                        other_value_sets_decomposed.get("sampled"))

            if base_symbol:
                symbol = base_symbol
            else:
                symbol = specification

            x_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=1,
                decomposition_method=decomposition_method,
            )
            y_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=2,
                decomposition_method=decomposition_method,
            )

            figure_labels = {
                "title": decomposition_method,
                "x label": x_label,
                "y label": y_label
            }

            if other_data_set:
                plot_values_decomposed = other_values_decomposed
            else:
                plot_values_decomposed = values_decomposed

            if plot_values_decomposed is None:
                print("No values to plot.\n")
                return

            print("Plotting {}{}.".format(
                "decomposed " if decomposition_method else "", title))

            # No colour-coding
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print("    {} plotted and saved ({}).".format(
                capitalise_string(title),
                format_duration(plot_duration)))

            # Samples
            if sampled_data_set:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    centroids=centroids_decomposed,
                    sampled_values=sampled_values_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with samples) plotted and saved ({}).".format(
                    capitalise_string(title),
                    format_duration(plot_duration)))

            # Labels
            if colouring_data_set.labels is not None:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with labels) plotted and saved ({}).".format(
                    capitalise_string(title),
                    format_duration(plot_duration)))

            # Superset labels
            if colouring_data_set.superset_labels is not None:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="superset labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with superset labels) plotted and saved "
                    "({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # For each class
            if analysis_level == "extensive":
                if colouring_data_set.number_of_classes <= 10:
                    plot_time_start = time()
                    for class_name in colouring_data_set.class_names:
                        figure, figure_name = figures.plot_values(
                            plot_values_decomposed,
                            colour_coding="class",
                            colouring_data_set=colouring_data_set,
                            centroids=centroids_decomposed,
                            class_name=class_name,
                            figure_labels=figure_labels,
                            example_tag=data_set.tags["example"],
                            name=name)
                        figures.save_figure(
                            figure=figure,
                            name=figure_name,
                            options=export_options,
                            directory=decompositions_directory)
                    plot_duration = time() - plot_time_start
                    print(
                        "    {} (for each class) plotted and saved "
                        "({}).".format(
                            capitalise_string(title),
                            format_duration(plot_duration)))

                if (colouring_data_set.superset_labels is not None
                        and data_set.number_of_superset_classes <= 10):
                    plot_time_start = time()
                    for superset_class_name in (
                            colouring_data_set.superset_class_names):
                        figure, figure_name = figures.plot_values(
                            plot_values_decomposed,
                            colour_coding="superset class",
                            colouring_data_set=colouring_data_set,
                            centroids=centroids_decomposed,
                            class_name=superset_class_name,
                            figure_labels=figure_labels,
                            example_tag=data_set.tags["example"],
                            name=name)
                        figures.save_figure(
                            figure=figure,
                            name=figure_name,
                            options=export_options,
                            directory=decompositions_directory)
                    plot_duration = time() - plot_time_start
                    print(
                        "    {} (for each superset class) plotted and "
                        "saved ({}).".format(
                            capitalise_string(title),
                            format_duration(plot_duration)))

            # Batches
            if colouring_data_set.has_batches:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="batches",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with batches) plotted and saved ({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Cluster IDs
            if colouring_data_set.has_predicted_cluster_ids:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted cluster IDs",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with predicted cluster IDs) plotted and saved "
                    "({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Predicted labels
            if colouring_data_set.has_predicted_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with predicted labels) plotted and saved "
                    "({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Predicted superset labels
            if colouring_data_set.has_predicted_superset_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted superset labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with predicted superset labels) plotted and "
                    "saved ({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Count sum
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                colour_coding="count sum",
                colouring_data_set=colouring_data_set,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print("    {} (with count sum) plotted and saved ({}).".format(
                capitalise_string(title),
                format_duration(plot_duration)))

            # Features
            for feature_index in highlight_feature_indices:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="feature",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    feature_index=feature_index,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with {}) plotted and saved ({}).".format(
                    capitalise_string(title),
                    data_set.feature_names[feature_index],
                    format_duration(plot_duration)))

            print()

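# A condensed sketch of the `decompose` helper used above, under the
# assumption that it fits the chosen method on the main values and applies
# the same fitted transformation to any other value sets and to centroid
# means. Only the PCA case is sketched here (dense input, scikit-learn);
# the real helper also handles other methods and centroid covariances.
def _decompose_pca_sketch(values, other_value_sets=None, centroids=None,
                          number_of_components=2):
    from sklearn.decomposition import PCA
    model = PCA(n_components=number_of_components)
    values_decomposed = model.fit_transform(values)
    if other_value_sets:
        # Reuse the fitted model so all value sets share one projection
        other_value_sets = {
            set_name: model.transform(other_values)
            for set_name, other_values in other_value_sets.items()
            if other_values is not None}
    if centroids:
        # Project centroid means with the same fitted model (assumed layout:
        # centroids[distribution]["means"] is a centroids-by-features array)
        centroids = {
            distribution: dict(centroid,
                               means=model.transform(centroid["means"]))
            for distribution, centroid in centroids.items() if centroid}
    return values_decomposed, other_value_sets, centroids
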
def analyse_distributions(data_set, colouring_data_set=None, cutoffs=None,
                          preprocessed=False, analysis_level="normal",
                          export_options=None, analyses_directory=None):
    if not colouring_data_set:
        colouring_data_set = data_set
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]
    distribution_directory = os.path.join(analyses_directory, "histograms")

    data_set_title = data_set.kind + " set"
    data_set_name = data_set.kind
    if data_set.version != "original":
        data_set_title = data_set.version + " " + data_set_title
        data_set_name = None

    data_set_discreteness = data_set.discreteness and not preprocessed

    print("Plotting distributions for {}.".format(data_set_title))

    # Class distribution
    if (data_set.number_of_classes and data_set.number_of_classes < 100
            and colouring_data_set == data_set):
        distribution_time_start = time()
        figure, figure_name = figures.plot_class_histogram(
            labels=data_set.labels,
            class_names=data_set.class_names,
            class_palette=data_set.class_palette,
            normed=True,
            scale="linear",
            label_sorter=data_set.label_sorter,
            name=data_set_name)
        figures.save_figure(
            figure=figure,
            name=figure_name,
            options=export_options,
            directory=distribution_directory)
        distribution_duration = time() - distribution_time_start
        print("    Class distribution plotted and saved ({}).".format(
            format_duration(distribution_duration)))

    # Superset class distribution
    if data_set.label_superset and colouring_data_set == data_set:
        distribution_time_start = time()
        figure, figure_name = figures.plot_class_histogram(
            labels=data_set.superset_labels,
            class_names=data_set.superset_class_names,
            class_palette=data_set.superset_class_palette,
            normed=True,
            scale="linear",
            label_sorter=data_set.superset_label_sorter,
            name=[data_set_name, "superset"])
        figures.save_figure(
            figure=figure,
            name=figure_name,
            options=export_options,
            directory=distribution_directory)
        distribution_duration = time() - distribution_time_start
        print(
            "    Superset class distribution plotted and saved "
            "({}).".format(format_duration(distribution_duration)))

    # Count distribution
    if scipy.sparse.issparse(data_set.values):
        series = data_set.values.data
        excess_zero_count = data_set.values.size - series.size
    else:
        series = data_set.values.reshape(-1)
        excess_zero_count = 0
    distribution_time_start = time()
    for x_scale in ["linear", "log"]:
        figure, figure_name = figures.plot_histogram(
            series=series,
            excess_zero_count=excess_zero_count,
            label=data_set.tags["value"].capitalize() + "s",
            discrete=data_set_discreteness,
            normed=True,
            x_scale=x_scale,
            y_scale="log",
            name=["counts", data_set_name])
        figures.save_figure(
            figure=figure,
            name=figure_name,
            options=export_options,
            directory=distribution_directory)
    distribution_duration = time() - distribution_time_start
    print("    Count distribution plotted and saved ({}).".format(
        format_duration(distribution_duration)))

    # Count distributions with cut-off
    if (analysis_level == "extensive" and cutoffs
            and data_set.example_type == "counts"):
        distribution_time_start = time()
        for cutoff in cutoffs:
            figure, figure_name = figures.plot_cutoff_count_histogram(
                series=series,
                excess_zero_count=excess_zero_count,
                cutoff=cutoff,
                normed=True,
                scale="log",
                name=data_set_name)
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=distribution_directory + "-counts")
        distribution_duration = time() - distribution_time_start
        print(
            "    Count distributions with cut-offs plotted and saved "
            "({}).".format(format_duration(distribution_duration)))

    # Count sum distribution
    distribution_time_start = time()
    figure, figure_name = figures.plot_histogram(
        series=data_set.count_sum,
        label="Total number of {}s per {}".format(
            data_set.tags["item"], data_set.tags["example"]),
        normed=True,
        y_scale="log",
        name=["count sum", data_set_name])
    figures.save_figure(
        figure=figure,
        name=figure_name,
        options=export_options,
        directory=distribution_directory)
    distribution_duration = time() - distribution_time_start
    print("    Count sum distribution plotted and saved ({}).".format(
        format_duration(distribution_duration)))

    # Count distributions and count sum distributions for each class
    if analysis_level == "extensive" and colouring_data_set.labels is not None:
        class_count_distribution_directory = distribution_directory
        if data_set.version == "original":
            class_count_distribution_directory += "-classes"

        if colouring_data_set.label_superset:
            labels = colouring_data_set.superset_labels
            class_names = colouring_data_set.superset_class_names
            class_palette = colouring_data_set.superset_class_palette
            label_sorter = colouring_data_set.superset_label_sorter
        else:
            labels = colouring_data_set.labels
            class_names = colouring_data_set.class_names
            class_palette = colouring_data_set.class_palette
            label_sorter = colouring_data_set.label_sorter

        if not class_palette:
            index_palette = style.lighter_palette(
                colouring_data_set.number_of_classes)
            class_palette = {
                class_name: index_palette[i] for i, class_name in enumerate(
                    sorted(class_names, key=label_sorter))
            }

        distribution_time_start = time()
        for class_name in class_names:
            class_indices = labels == class_name
            if not class_indices.any():
                continue
            values_label = data_set.values[class_indices]
            if scipy.sparse.issparse(values_label):
                series = values_label.data
                excess_zero_count = values_label.size - series.size
            else:
                # Flatten the values for this class only, not the whole
                # data set
                series = values_label.reshape(-1)
                excess_zero_count = 0
            figure, figure_name = figures.plot_histogram(
                series=series,
                excess_zero_count=excess_zero_count,
                label=data_set.tags["value"].capitalize() + "s",
                discrete=data_set_discreteness,
                normed=True,
                y_scale="log",
                colour=class_palette[class_name],
                name=["counts", data_set_name, "class", class_name])
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=class_count_distribution_directory)
        distribution_duration = time() - distribution_time_start
        print(
            "    Count distributions for each class plotted and saved "
            "({}).".format(format_duration(distribution_duration)))

        distribution_time_start = time()
        for class_name in class_names:
            class_indices = labels == class_name
            if not class_indices.any():
                continue
            figure, figure_name = figures.plot_histogram(
                series=data_set.count_sum[class_indices],
                label="Total number of {}s per {}".format(
                    data_set.tags["item"], data_set.tags["example"]),
                normed=True,
                y_scale="log",
                colour=class_palette[class_name],
                name=["count sum", data_set_name, "class", class_name])
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=class_count_distribution_directory)
        distribution_duration = time() - distribution_time_start
        print(
            "    Count sum distributions for each class plotted and saved "
            "({}).".format(format_duration(distribution_duration)))

    print()

def analyse_matrices(data_set, plot_distances=False, name=None,
                     export_options=None, analyses_directory=None):
    if plot_distances:
        base_name = "distances"
    else:
        base_name = "heat_maps"

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]
    analyses_directory = os.path.join(analyses_directory, base_name)

    if not name:
        name = []
    elif not isinstance(name, list):
        name = [name]

    name.insert(0, base_name)

    # Subsampling indices (if necessary)
    random_state = numpy.random.RandomState(57)
    shuffled_indices = random_state.permutation(data_set.number_of_examples)

    # Feature selection for plotting (if necessary)
    feature_indices_for_plotting = None
    if (not plot_distances and data_set.number_of_features
            > MAXIMUM_NUMBER_OF_FEATURES_FOR_HEAT_MAPS):
        feature_variances = data_set.values.var(axis=0)
        if isinstance(feature_variances, numpy.matrix):
            feature_variances = feature_variances.A.squeeze()
        feature_indices_for_plotting = numpy.argsort(feature_variances)[
            -MAXIMUM_NUMBER_OF_FEATURES_FOR_HEAT_MAPS:]
        feature_indices_for_plotting.sort()

    # Class palette
    class_palette = data_set.class_palette
    if data_set.labels is not None and not class_palette:
        index_palette = style.lighter_palette(data_set.number_of_classes)
        class_palette = {
            class_name: tuple(index_palette[i]) for i, class_name in
            enumerate(sorted(data_set.class_names,
                             key=data_set.label_sorter))
        }

    # Axis labels
    example_label = data_set.tags["example"].capitalize() + "s"
    feature_label = data_set.tags["feature"].capitalize() + "s"
    value_label = data_set.tags["value"].capitalize() + "s"

    version = data_set.version
    symbol = None
    value_name = "values"

    if version in ["z", "x"]:
        symbol = "$\\mathbf{{{}}}$".format(version)
        value_name = "component"
    elif version in ["y"]:
        symbol = "${}$".format(version)
        value_name = "value"

    if version in ["y", "z"]:
        feature_label = " ".join([symbol, value_name + "s"])

    if plot_distances:
        if version in ["y", "z"]:
            value_label = symbol
        else:
            value_label = version

    if feature_indices_for_plotting is not None:
        feature_label = "{} most varying {}".format(
            len(feature_indices_for_plotting), feature_label.lower())

    plot_string = "Plotting heat map for {} values."
    if plot_distances:
        plot_string = "Plotting pairwise distances in {} space."
    print(plot_string.format(data_set.version))

    sorting_methods = ["hierarchical_clustering"]
    if data_set.labels is not None:
        sorting_methods.insert(0, "labels")

    for sorting_method in sorting_methods:
        distance_metrics = [None]
        if plot_distances or sorting_method == "hierarchical_clustering":
            distance_metrics = ["Euclidean", "cosine"]

        for distance_metric in distance_metrics:
            start_time = time()

            if (sorting_method == "hierarchical_clustering"
                    and data_set.number_of_examples
                    > MAXIMUM_NUMBER_OF_EXAMPLES_FOR_DENDROGRAM):
                sample_size = MAXIMUM_NUMBER_OF_EXAMPLES_FOR_DENDROGRAM
            elif (data_set.number_of_examples
                    > MAXIMUM_NUMBER_OF_EXAMPLES_FOR_HEAT_MAPS):
                sample_size = MAXIMUM_NUMBER_OF_EXAMPLES_FOR_HEAT_MAPS
            else:
                sample_size = None

            indices = numpy.arange(data_set.number_of_examples)
            if sample_size:
                indices = shuffled_indices[:sample_size]
                example_label = "{} randomly sampled {}".format(
                    sample_size, data_set.tags["example"] + "s")

            figure, figure_name = figures.plot_matrix(
                feature_matrix=data_set.values[indices],
                plot_distances=plot_distances,
                example_label=example_label,
                feature_label=feature_label,
                value_label=value_label,
                sorting_method=sorting_method,
                distance_metric=distance_metric,
                labels=(data_set.labels[indices]
                        if data_set.labels is not None else None),
                label_kind=data_set.tags["class"],
                class_palette=class_palette,
                feature_indices_for_plotting=feature_indices_for_plotting,
                name_parts=name + [data_set.version, distance_metric,
                                   sorting_method])
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=analyses_directory)

            duration = time() - start_time

            plot_kind_string = "Heat map for {} values".format(
                data_set.version)
            if plot_distances:
                plot_kind_string = "{} distances in {} space".format(
                    distance_metric.capitalize(), data_set.version)

            subsampling_string = ""
            if sample_size:
                subsampling_string = (
                    "{} {} randomly sampled examples".format(
                        "for" if plot_distances else "of", sample_size))

            sort_string = "sorted using {}".format(
                sorting_method.replace("_", " "))
            if (not plot_distances
                    and sorting_method == "hierarchical_clustering"):
                sort_string += " (with {} distances)".format(
                    distance_metric)

            print("    " + " ".join([
                s for s in [
                    plot_kind_string, subsampling_string, sort_string,
                    "plotted and saved",
                    "({})".format(format_duration(duration))
                ] if s
            ]) + ".")

    print()

def predict_labels(training_set, evaluation_set, specifications=None,
                   method=None, number_of_clusters=None):
    if specifications is None:
        if method is None:
            method = defaults["evaluation"]["prediction_method"]
        specifications = PredictionSpecifications(
            method=method,
            number_of_clusters=number_of_clusters,
            training_set=training_set.kind)

    method = specifications.method
    number_of_clusters = specifications.number_of_clusters
    predict = PREDICTION_METHODS[specifications.method]["function"]

    print("Predicting labels for evaluation set using {} with {} "
          "components.".format(method, number_of_clusters))
    prediction_time_start = time()

    if evaluation_set.has_labels:
        class_names_to_class_ids = numpy.vectorize(
            lambda class_name:
                evaluation_set.class_name_to_class_id[class_name])
        class_ids_to_class_names = numpy.vectorize(
            lambda class_id:
                evaluation_set.class_id_to_class_name[class_id])

        evaluation_label_ids = class_names_to_class_ids(
            evaluation_set.labels)

        if evaluation_set.excluded_classes:
            excluded_class_ids = class_names_to_class_ids(
                evaluation_set.excluded_classes)
        else:
            excluded_class_ids = []

    if evaluation_set.has_superset_labels:
        superset_class_names_to_superset_class_ids = numpy.vectorize(
            lambda superset_class_name: evaluation_set
            .superset_class_name_to_superset_class_id[superset_class_name])
        superset_class_ids_to_superset_class_names = numpy.vectorize(
            lambda superset_class_id: evaluation_set
            .superset_class_id_to_superset_class_name[superset_class_id])

        evaluation_superset_label_ids = (
            superset_class_names_to_superset_class_ids(
                evaluation_set.superset_labels))

        if evaluation_set.excluded_superset_classes:
            excluded_superset_class_ids = (
                superset_class_names_to_superset_class_ids(
                    evaluation_set.excluded_superset_classes))
        else:
            excluded_superset_class_ids = []

    cluster_ids, predicted_labels, predicted_superset_labels = predict(
        training_set=training_set,
        evaluation_set=evaluation_set,
        number_of_clusters=number_of_clusters)

    if cluster_ids is not None:
        if predicted_labels is None and evaluation_set.has_labels:
            predicted_label_ids = map_cluster_ids_to_label_ids(
                evaluation_label_ids, cluster_ids, excluded_class_ids)
            predicted_labels = class_ids_to_class_names(predicted_label_ids)

        if (predicted_superset_labels is None
                and evaluation_set.has_superset_labels):
            predicted_superset_label_ids = map_cluster_ids_to_label_ids(
                evaluation_superset_label_ids, cluster_ids,
                excluded_superset_class_ids)
            predicted_superset_labels = (
                superset_class_ids_to_superset_class_names(
                    predicted_superset_label_ids))

    prediction_duration = time() - prediction_time_start
    print("Labels predicted ({}).".format(
        format_duration(prediction_duration)))

    return cluster_ids, predicted_labels, predicted_superset_labels

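# Illustrative sketch of the majority-vote mapping that
# `map_cluster_ids_to_label_ids` is assumed to perform above: each cluster
# is assigned the most frequent non-excluded label ID among its members.
# The real helper may break ties and handle empty clusters differently.
def _map_cluster_ids_to_label_ids_sketch(label_ids, cluster_ids,
                                         excluded_class_ids=()):
    predicted_label_ids = numpy.empty_like(label_ids)
    for cluster_id in numpy.unique(cluster_ids):
        cluster_members = cluster_ids == cluster_id
        member_label_ids = label_ids[cluster_members]
        # Ignore excluded classes when voting
        member_label_ids = member_label_ids[
            ~numpy.isin(member_label_ids, excluded_class_ids)]
        if member_label_ids.size:
            candidate_ids, counts = numpy.unique(
                member_label_ids, return_counts=True)
            majority_label_id = candidate_ids[counts.argmax()]
        else:
            # Fall back to a sentinel when only excluded labels remain
            majority_label_id = -1
        predicted_label_ids[cluster_members] = majority_label_id
    return predicted_label_ids
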
def acquire_data_set(title, urls, directory):
    paths = {}

    if not urls:
        return paths

    if not os.path.exists(directory):
        os.makedirs(directory)

    for values_or_labels in urls:
        paths[values_or_labels] = {}

        for kind in urls[values_or_labels]:
            url = urls[values_or_labels][kind]

            if not url:
                paths[values_or_labels][kind] = None
                continue

            url_filename = os.path.split(url)[-1]
            file_extension = extension(url_filename)

            filename = "-".join(
                map(normalise_string, [title, values_or_labels, kind]))
            path = os.path.join(directory, filename) + file_extension

            paths[values_or_labels][kind] = path

            if not os.path.isfile(path):
                if url.startswith("."):
                    raise Exception(
                        "Data set file has to be placed manually in the "
                        "correct folder.")
                if os.path.isfile(url):
                    print("Copying {} for {} set.".format(
                        values_or_labels, kind))
                    start_time = time()
                    copy_file(url, path)
                    duration = time() - start_time
                    print("Data set copied ({}).".format(
                        format_duration(duration)))
                    print()
                else:
                    print("Downloading {} for {} set.".format(
                        values_or_labels, kind))
                    start_time = time()
                    download_file(url, path)
                    duration = time() - start_time
                    print("Data set downloaded ({}).".format(
                        format_duration(duration)))
                    print()

    return paths

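# Minimal sketches of the two file helpers used by `acquire_data_set`,
# assuming they do no more than a plain copy and a plain download; the real
# helpers may add progress reporting or retries.
def _copy_file_sketch(source_path, destination_path):
    import shutil
    shutil.copyfile(source_path, destination_path)


def _download_file_sketch(url, destination_path):
    import urllib.request
    urllib.request.urlretrieve(url, destination_path)
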
def select_features(values_dictionary, feature_names, method=None,
                    parameters=None):
    method = normalise_string(method)

    print("Selecting features.")
    start_time = time()

    values = values_dictionary["original"]

    n_examples, n_features = values.shape

    if method == "remove_zeros":
        total_feature_sum = values.sum(axis=0)
        if isinstance(total_feature_sum, numpy.matrix):
            total_feature_sum = total_feature_sum.A.squeeze()
        indices = total_feature_sum != 0

    elif method == "keep_variances_above":
        variances = values.var(axis=0)
        if isinstance(variances, numpy.matrix):
            variances = variances.A.squeeze()
        if parameters:
            threshold = float(parameters[0])
        else:
            threshold = 0.5
        indices = variances > threshold

    elif method == "keep_highest_variances":
        variances = values.var(axis=0)
        if isinstance(variances, numpy.matrix):
            variances = variances.A.squeeze()
        variance_sorted_indices = numpy.argsort(variances)
        if parameters:
            number_to_keep = int(parameters[0])
        else:
            number_to_keep = int(n_examples / 2)
        indices = numpy.sort(variance_sorted_indices[-number_to_keep:])

    else:
        raise ValueError(
            "Feature selection `{}` not found.".format(method))

    if method:
        error = Exception(
            "No features excluded using feature selection `{}`.".format(
                method))
        if indices.dtype == "bool" and all(indices):
            raise error
        elif indices.dtype != "bool" and len(indices) == n_features:
            raise error

    feature_selected_values = {}

    for version, version_values in values_dictionary.items():
        if version_values is not None:
            feature_selected_values[version] = version_values[:, indices]
        else:
            feature_selected_values[version] = None

    feature_selected_feature_names = feature_names[indices]

    n_features_changed = len(feature_selected_feature_names)

    duration = time() - start_time
    print("{} features selected, {} excluded ({}).".format(
        n_features_changed,
        n_features - n_features_changed,
        format_duration(duration)))

    return feature_selected_values, feature_selected_feature_names

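# Illustrative call (hypothetical data): keep the 100 most varying features
# of a random count matrix, applying the same column selection to every
# version in the dictionary.
def _example_select_features():
    values = numpy.random.RandomState(0).poisson(1, size=(50, 500))
    feature_names = numpy.array(
        ["feature {}".format(j) for j in range(values.shape[1])])
    selected_values, selected_names = select_features(
        {"original": values, "preprocessed": None},
        feature_names,
        method="keep_highest_variances",
        parameters=[100])
    return selected_values["original"].shape, selected_names[:5]
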
def split_data_set(data_dictionary, method=None, fraction=None):
    if method is None:
        method = defaults["data"]["splitting_method"]
    if fraction is None:
        fraction = defaults["data"]["splitting_fraction"]

    print("Splitting data set.")
    start_time = time()

    if method == "default":
        if "split indices" in data_dictionary:
            method = "indices"
        else:
            method = "random"

    method = normalise_string(method)

    n = data_dictionary["values"].shape[0]

    random_state = numpy.random.RandomState(42)

    if method in ["random", "sequential"]:
        n_training_validation = int(fraction * n)
        n_training = int(fraction * n_training_validation)

        if method == "random":
            indices = random_state.permutation(n)
        else:
            indices = numpy.arange(n)

        training_indices = indices[:n_training]
        validation_indices = indices[n_training:n_training_validation]
        test_indices = indices[n_training_validation:]

    elif method == "indices":
        split_indices = data_dictionary["split indices"]

        training_indices = split_indices["training"]
        test_indices = split_indices["test"]

        if "validation" in split_indices:
            validation_indices = split_indices["validation"]
        else:
            n_training_validation = training_indices.stop
            n_all = test_indices.stop
            n_training = n_training_validation - (
                n_all - n_training_validation)
            training_indices = slice(n_training)
            validation_indices = slice(n_training, n_training_validation)

    elif method == "macosko":
        values = data_dictionary["values"]

        minimum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)

        training_indices = numpy.nonzero(
            number_of_non_zero_elements
            > minimum_number_of_non_zero_elements)[0]
        test_validation_indices = numpy.nonzero(
            number_of_non_zero_elements
            <= minimum_number_of_non_zero_elements)[0]

        random_state.shuffle(test_validation_indices)

        n_validation_test = len(test_validation_indices)
        n_validation = int((1 - fraction) * n_validation_test)

        validation_indices = test_validation_indices[:n_validation]
        test_indices = test_validation_indices[n_validation:]

    else:
        raise ValueError("Splitting method `{}` not found.".format(method))

    split_data_dictionary = {
        "training set": {
            "values": data_dictionary["values"][training_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][training_indices],
            "batch indices": None
        },
        "validation set": {
            "values": data_dictionary["values"][validation_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][validation_indices],
            "batch indices": None
        },
        "test set": {
            "values": data_dictionary["values"][test_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][test_indices],
            "batch indices": None
        },
        "feature names": data_dictionary["feature names"],
        "class names": data_dictionary["class names"]
    }

    if "labels" in data_dictionary and data_dictionary["labels"] is not None:
        split_data_dictionary["training set"]["labels"] = (
            data_dictionary["labels"][training_indices])
        split_data_dictionary["validation set"]["labels"] = (
            data_dictionary["labels"][validation_indices])
        split_data_dictionary["test set"]["labels"] = (
            data_dictionary["labels"][test_indices])

    if ("preprocessed values" in data_dictionary
            and data_dictionary["preprocessed values"] is not None):
        split_data_dictionary["training set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][training_indices])
        split_data_dictionary["validation set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][validation_indices])
        split_data_dictionary["test set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][test_indices])

    if ("binarised values" in data_dictionary
            and data_dictionary["binarised values"] is not None):
        split_data_dictionary["training set"]["binarised values"] = (
            data_dictionary["binarised values"][training_indices])
        split_data_dictionary["validation set"]["binarised values"] = (
            data_dictionary["binarised values"][validation_indices])
        split_data_dictionary["test set"]["binarised values"] = (
            data_dictionary["binarised values"][test_indices])

    if ("batch indices" in data_dictionary
            and data_dictionary["batch indices"] is not None):
        split_data_dictionary["training set"]["batch indices"] = (
            data_dictionary["batch indices"][training_indices])
        split_data_dictionary["validation set"]["batch indices"] = (
            data_dictionary["batch indices"][validation_indices])
        split_data_dictionary["test set"]["batch indices"] = (
            data_dictionary["batch indices"][test_indices])

    duration = time() - start_time
    print("Data set split ({}).".format(format_duration(duration)))

    return split_data_dictionary

def filter_examples(values_dictionary, example_names, method=None,
                    parameters=None, labels=None, excluded_classes=None,
                    superset_labels=None, excluded_superset_classes=None,
                    batch_indices=None, count_sum=None):
    print("Filtering examples.")
    start_time = time()

    method = normalise_string(method)

    if superset_labels is not None:
        filter_labels = superset_labels.copy()
        filter_excluded_classes = excluded_superset_classes
    elif labels is not None:
        filter_labels = labels.copy()
        filter_excluded_classes = excluded_classes
    else:
        filter_labels = None
        filter_excluded_classes = None

    filter_class_names = numpy.unique(filter_labels)

    values = values_dictionary["original"]

    n_examples, n_features = values.shape

    filter_indices = numpy.arange(n_examples)

    if method == "macosko":
        minimum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)
        filter_indices = numpy.nonzero(
            number_of_non_zero_elements
            > minimum_number_of_non_zero_elements)[0]

    elif method == "inverse_macosko":
        maximum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)
        filter_indices = numpy.nonzero(
            number_of_non_zero_elements
            <= maximum_number_of_non_zero_elements)[0]

    elif method in ["keep", "remove", "excluded_classes"]:
        if filter_labels is None:
            raise ValueError(
                "Cannot filter examples based on labels, "
                "since data set is unlabelled.")

        if method == "excluded_classes":
            method = "remove"
            parameters = filter_excluded_classes

        if method == "keep":
            label_indices = set()
            for parameter in parameters:
                for class_name in filter_class_names:
                    normalised_class_name = normalise_string(
                        str(class_name))
                    normalised_parameter = normalise_string(str(parameter))
                    if normalised_class_name == normalised_parameter:
                        class_indices = filter_labels == class_name
                        label_indices.update(
                            filter_indices[class_indices])
            filter_indices = filter_indices[list(label_indices)]

        elif method == "remove":
            for parameter in parameters:
                for class_name in filter_class_names:
                    normalised_class_name = normalise_string(
                        str(class_name))
                    normalised_parameter = normalise_string(str(parameter))
                    if normalised_class_name == normalised_parameter:
                        label_indices = filter_labels != class_name
                        filter_labels = filter_labels[label_indices]
                        filter_indices = filter_indices[label_indices]

    elif method == "remove_count_sum_above":
        threshold = int(parameters[0])
        filter_indices = filter_indices[count_sum.reshape(-1) <= threshold]

    elif method == "random":
        n_samples = int(parameters[0])
        n_samples = min(n_samples, n_examples)
        random_state = numpy.random.RandomState(90)
        filter_indices = random_state.permutation(n_examples)[:n_samples]

    else:
        raise ValueError(
            "Example filter `{}` not found.".format(method))

    if method and len(filter_indices) == n_examples:
        raise Exception(
            "No examples filtered out using example filter `{}`.".format(
                method))

    example_filtered_values = {}

    for version, version_values in values_dictionary.items():
        if version_values is not None:
            example_filtered_values[version] = (
                version_values[filter_indices, :])
        else:
            example_filtered_values[version] = None

    example_filtered_example_names = example_names[filter_indices]

    if labels is not None:
        example_filtered_labels = labels[filter_indices]
    else:
        example_filtered_labels = None

    if batch_indices is not None:
        example_filtered_batch_indices = batch_indices[filter_indices]
    else:
        example_filtered_batch_indices = None

    n_examples_changed = len(example_filtered_example_names)

    duration = time() - start_time
    print("{} examples filtered out, {} remaining ({}).".format(
        n_examples - n_examples_changed,
        n_examples_changed,
        format_duration(duration)))

    return (example_filtered_values, example_filtered_example_names,
            example_filtered_labels, example_filtered_batch_indices)

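# Illustrative call (hypothetical labelled data): remove every example
# labelled "unknown", keeping values, names, and labels aligned.
def _example_filter_examples():
    values = numpy.random.RandomState(2).poisson(1, size=(6, 4))
    example_names = numpy.array(["example {}".format(i) for i in range(6)])
    labels = numpy.array(
        ["neuron", "unknown", "neuron", "glia", "unknown", "glia"])
    filtered_values, filtered_names, filtered_labels, _ = filter_examples(
        {"original": values, "preprocessed": None},
        example_names,
        method="remove",
        parameters=["unknown"],
        labels=labels)
    return filtered_values["original"].shape, filtered_labels
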
def binarise(self):
    if self.preprocessed_values is None:
        raise NotImplementedError(
            "Data set values have to have been preprocessed and feature "
            "selected first.")

    binarise_preprocessing = ["binarise"]

    sparse_path = self._build_preprocessed_path(
        map_features=self.map_features,
        preprocessing_methods=binarise_preprocessing,
        feature_selection=self.feature_selection,
        feature_selection_parameters=self.feature_selection_parameters,
        example_filter=self.example_filter,
        example_filter_parameters=self.example_filter_parameters)

    if os.path.isfile(sparse_path):
        print("Loading binarised data.")
        data_dictionary = internal_io.load_data_dictionary(sparse_path)
    else:
        binarising_time_start = time()

        if self.preprocessing_methods != binarise_preprocessing:
            print("Binarising values.")
            start_time = time()
            binarisation_function = processing.build_preprocessor(
                binarise_preprocessing)
            binarised_values = binarisation_function(self.values)
            duration = time() - start_time
            print("Values binarised ({}).".format(
                format_duration(duration)))
            print()
        else:
            binarised_values = self.preprocessed_values

        data_dictionary = {
            "values": self.values,
            "preprocessed values": binarised_values,
            "feature names": self.feature_names
        }

        binarising_duration = time() - binarising_time_start

        if binarising_duration > MINIMUM_NUMBER_OF_SECONDS_BEFORE_SAVING:
            if not os.path.exists(self.preprocess_directory):
                os.makedirs(self.preprocess_directory)
            print("Saving binarised data set.")
            internal_io.save_data_dictionary(data_dictionary, sparse_path)

    # Convert whichever binarised values are at hand (freshly computed or
    # loaded from disk) to a sparse matrix before updating the data set
    binarised_values = sparse.SparseRowMatrix(
        data_dictionary["preprocessed values"])

    self.update(binarised_values=binarised_values)

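# A minimal sketch of the binarisation step that
# `processing.build_preprocessor(["binarise"])` is assumed to produce above:
# nonzero entries become ones, zeros stay zero, and sparsity is preserved.
# The actual preprocessor may be composed differently.
def _binarise_sketch(values):
    if scipy.sparse.issparse(values):
        binarised = values.copy()
        binarised.data = numpy.ones_like(binarised.data)
        return binarised
    return (values != 0).astype(float)
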
def preprocess(self):
    if (not self.map_features and not self.preprocessing_methods
            and not self.feature_selection and not self.example_filter):
        self.update(preprocessed_values=None)
        return

    sparse_path = self._build_preprocessed_path(
        map_features=self.map_features,
        preprocessing_methods=self.preprocessing_methods,
        feature_selection=self.feature_selection,
        feature_selection_parameters=self.feature_selection_parameters,
        example_filter=self.example_filter,
        example_filter_parameters=self.example_filter_parameters)

    if os.path.isfile(sparse_path):
        print("Loading preprocessed data.")
        data_dictionary = internal_io.load_data_dictionary(sparse_path)
        if "preprocessed values" not in data_dictionary:
            data_dictionary["preprocessed values"] = None
        if self.map_features:
            self.features_mapped = True
            self.tags = _update_tag_for_mapped_features(self.tags)
        print()
    else:
        preprocessing_time_start = time()

        values = self.values
        example_names = self.example_names
        feature_names = self.feature_names

        if self.map_features and not self.features_mapped:
            print(
                "Mapping {} original features to {} new features.".format(
                    self.number_of_features, len(self.feature_mapping)))
            start_time = time()
            values, feature_names = processing.map_features(
                values, feature_names, self.feature_mapping)
            self.features_mapped = True
            self.tags = _update_tag_for_mapped_features(self.tags)
            duration = time() - start_time
            print("Features mapped ({}).".format(
                format_duration(duration)))
            print()

        if not self.preprocessed and self.preprocessing_methods:
            print("Preprocessing values.")
            start_time = time()
            preprocessing_function = processing.build_preprocessor(
                self.preprocessing_methods)
            preprocessed_values = preprocessing_function(values)
            duration = time() - start_time
            print("Values preprocessed ({}).".format(
                format_duration(duration)))
            print()
        else:
            preprocessed_values = None

        if self.feature_selection:
            values_dictionary, feature_names = processing.select_features(
                {
                    "original": values,
                    "preprocessed": preprocessed_values
                },
                self.feature_names,
                self.feature_selection,
                self.feature_selection_parameters)
            values = values_dictionary["original"]
            preprocessed_values = values_dictionary["preprocessed"]
            print()

        if self.example_filter:
            values_dictionary, example_names, labels, batch_indices = (
                processing.filter_examples(
                    {
                        "original": values,
                        "preprocessed": preprocessed_values
                    },
                    self.example_names,
                    self.example_filter,
                    self.example_filter_parameters,
                    labels=self.labels,
                    excluded_classes=self.excluded_classes,
                    superset_labels=self.superset_labels,
                    excluded_superset_classes=(
                        self.excluded_superset_classes),
                    batch_indices=self.batch_indices,
                    count_sum=self.count_sum))
            values = values_dictionary["original"]
            preprocessed_values = values_dictionary["preprocessed"]
            print()

        data_dictionary = {
            "values": values,
            "preprocessed values": preprocessed_values,
        }

        if self.features_mapped or self.feature_selection:
            data_dictionary["feature names"] = feature_names

        if self.example_filter:
            data_dictionary["example names"] = example_names
            data_dictionary["labels"] = labels
            data_dictionary["batch indices"] = batch_indices

        preprocessing_duration = time() - preprocessing_time_start

        if (preprocessing_duration
                > MINIMUM_NUMBER_OF_SECONDS_BEFORE_SAVING):
            if not os.path.exists(self.preprocess_directory):
                os.makedirs(self.preprocess_directory)
            print("Saving preprocessed data set.")
            internal_io.save_data_dictionary(data_dictionary, sparse_path)
            print()

    values = data_dictionary["values"]
    preprocessed_values = data_dictionary["preprocessed values"]

    if preprocessed_values is None:
        preprocessed_values = values

    if self.features_mapped or self.feature_selection:
        feature_names = data_dictionary["feature names"]
    else:
        feature_names = self.feature_names

    if self.example_filter:
        example_names = data_dictionary["example names"]
        labels = data_dictionary["labels"]
        batch_indices = data_dictionary["batch indices"]
    else:
        example_names = self.example_names
        labels = self.labels
        batch_indices = self.batch_indices

    values = sparse.SparseRowMatrix(values)
    preprocessed_values = sparse.SparseRowMatrix(preprocessed_values)

    self.update(
        values=values,
        preprocessed_values=preprocessed_values,
        example_names=example_names,
        feature_names=feature_names,
        labels=labels,
        batch_indices=batch_indices)