def _deduplicate_by_sorting(data_set: DataSet, callable_comparator) -> DataSet:
    """
    Remove duplicated data points from a data set, folding the metadata
    of every duplicate into the entry that is kept.

    Each data point is paired with its metadata structure and the pairs are
    sorted with the given comparator, which places duplicates next to each
    other. A single pass over the sorted pairs then keeps the first pair of
    every run of equal entries.

    :param data_set: DataSet - Data points and corresponding metadata structures.
    :param callable_comparator: Comparator called with two
        (data point, metadata) tuples; a result of 0 marks them as duplicates.
    :return: DataSet - Data set without duplicates.
    """
    # Pair every vector with its metadata structure, then order the
    # pairs so that duplicates become neighbours.
    pairs = list(zip(data_set.data_points, data_set.metadata))
    pairs.sort(key=functools.cmp_to_key(callable_comparator))

    unique_points = []
    merged_metadata = []

    # The pair most recently accepted as unique; every following pair
    # is compared against it.
    representative = None
    for pair in pairs:
        point, meta = pair
        is_duplicate = (
            representative is not None
            and callable_comparator(pair, representative) == 0
        )
        if is_duplicate:
            # Already represented: only fold its metadata into the
            # metadata of the kept entry.
            merged_metadata[-1] = DataSet.merge_metadata(
                merged_metadata[-1], meta)
            continue

        # First occurrence of this data point.
        unique_points.append(point)
        merged_metadata.append(meta)
        representative = pair

    return DataSet(unique_points, merged_metadata)
def _get_clustered_data_set(data_set: DataSet, clusters: list) -> DataSet:
    """
    Collapse every cluster into one data point with one metadata structure.

    The representative of a cluster is the mean of its member vectors; its
    metadata is the result of merging the metadata structures of all
    members with DataSet.merge_metadata, starting from an empty set.

    :param data_set: DataSet - Data points and corresponding metadata structures.
    :param clusters: list(list) - Data point indices contained in each cluster.
    :return: DataSet - Clustered data points and corresponding metadata structures.
    """
    representatives = []
    cluster_metadata = []

    for member_indices in clusters:
        # One mean vector stands in for all vectors of the cluster.
        member_vectors = [
            data_set.get_vector_at_index(index) for index in member_indices
        ]
        representatives.append(calculation.get_mean(member_vectors))

        # Fold the metadata of every member into a single structure.
        merged = set()
        for index in member_indices:
            member_metadata = data_set.get_metadata_at_index(index)
            merged = DataSet.merge_metadata(merged, member_metadata)
        cluster_metadata.append(merged)

    return DataSet(representatives, cluster_metadata)