def compare(first_data_set: DataSet, second_data_set: DataSet) -> tuple:
    """
    Count the metadata entries that are exclusive to each of two data sets.

    The metadata structures of all data points are pooled into one set per
    data set. The result holds the number of pooled entries found only in
    the first data set, followed by the number found only in the second
    one. Both counts are 0 when the two pooled sets are equal.

    :param first_data_set: DataSet - Data points and corresponding metadata structures.
    :param second_data_set: DataSet - Data points and corresponding metadata structures.
    :return: (int, int) - Number of metadata elements present in first but
        not in second data set (and vice versa).
    :raise ValueError: In case one argument is not a DataSet instance.
    """
    # Reject the call unless both arguments
    # are actual DataSet instances.
    if not (isinstance(first_data_set, DataSet)
            and isinstance(second_data_set, DataSet)):
        raise ValueError

    # Pool the metadata entries of every data
    # point of the first data set into one set.
    entries_of_first = {
        entry
        for position in range(len(first_data_set))
        for entry in first_data_set.get_metadata_at_index(position)
    }

    # Pool the metadata entries of every data
    # point of the second data set into one set.
    entries_of_second = {
        entry
        for position in range(len(second_data_set))
        for entry in second_data_set.get_metadata_at_index(position)
    }

    # Entries exclusive to one side are what remains
    # after removing everything the other side contains.
    only_in_first = entries_of_first - entries_of_second
    only_in_second = entries_of_second - entries_of_first

    return len(only_in_first), len(only_in_second)
def _get_clustered_data_set(data_set: DataSet, clusters: list) -> DataSet:
    """
    Collapse every cluster into one data point with one metadata structure.

    For each cluster the vectors of its members are reduced to a single
    representative via calculation.get_mean, and the members' metadata
    structures are folded together with DataSet.merge_metadata, starting
    from an empty set.

    :param data_set: DataSet - Data points and corresponding metadata structures.
    :param clusters: list(list) - Data point indices contained in each cluster.
    :return: DataSet - Clustered data points and corresponding metadata structures.
    """
    # One representative vector and one merged
    # metadata structure is kept per cluster.
    representatives = []
    merged_metadata = []

    for members in clusters:
        # Look up the vectors of all cluster members
        # and keep their mean as the representative.
        member_vectors = list(map(data_set.get_vector_at_index, members))
        representatives.append(calculation.get_mean(member_vectors))

        # Fold the metadata of all cluster members
        # into one structure, one member at a time.
        combined = set()
        for member in members:
            combined = DataSet.merge_metadata(
                combined, data_set.get_metadata_at_index(member))
        merged_metadata.append(combined)

    # Wrap the compressed vectors and
    # metadata in a new data set.
    return DataSet(representatives, merged_metadata)