예제 #1
0
def _deduplicate_by_sorting(data_set: DataSet, callable_comparator) -> DataSet:
    """
    Collapse duplicated data points into single entries,
    merging the metadata of every duplicate into the entry kept.
    Duplicates are found by sorting, which places entries
    the comparator reports as equal next to each other.
    :param data_set: DataSet - Data points and
        corresponding metadata structures.
    :param callable_comparator: Callable - Old-style comparison
        function. It is called with two (data point, metadata)
        tuples and must return 0 when both are duplicates.
    :return: DataSet - Data set without duplicates.
    """
    # Pair every data point with its metadata structure and order
    # the pairs so that duplicates form contiguous runs.
    sort_key = functools.cmp_to_key(callable_comparator)
    ordered_pairs = sorted(
        zip(data_set.data_points, data_set.metadata), key=sort_key
    )

    unique_points = []
    unique_metadata = []
    # First pair of the run currently being processed.
    representative = None
    for pair in ordered_pairs:
        is_new = (
            representative is None
            or callable_comparator(pair, representative) != 0
        )
        if not is_new:
            # Same data point as the current representative:
            # keep a single entry and fold the metadata into it.
            unique_metadata[-1] = DataSet.merge_metadata(
                unique_metadata[-1], pair[1]
            )
            continue

        # A new run starts here, so record
        # this pair as its representative.
        point, metadata = pair
        unique_points.append(point)
        unique_metadata.append(metadata)
        representative = pair

    return DataSet(unique_points, unique_metadata)
예제 #2
0
def _get_clustered_data_set(data_set: DataSet, clusters: list) -> DataSet:
    """
    Reduce every cluster to one data point
    and one corresponding metadata structure.
    The data point is the result of calculation.get_mean()
    applied to the cluster's vectors. The metadata structure
    is built by folding the members' metadata together with
    DataSet.merge_metadata(), starting from an empty set.
    :param data_set: DataSet - Data points
        and corresponding metadata structures.
    :param clusters: list(list) - Data point
        indices contained in each cluster.
    :return: DataSet - One data point and one
        metadata structure per cluster, in cluster order.
    """

    def representative_of(member_indices):
        # Gather the vectors of all cluster members
        # and condense them into a single vector.
        member_vectors = []
        for idx in member_indices:
            member_vectors.append(data_set.get_vector_at_index(idx))
        return calculation.get_mean(member_vectors)

    def metadata_of(member_indices):
        # Fold the metadata of all cluster members
        # into one structure, seeded with an empty set.
        merged = set()
        for idx in member_indices:
            member_metadata = data_set.get_metadata_at_index(idx)
            merged = DataSet.merge_metadata(merged, member_metadata)
        return merged

    representatives = []
    cluster_metadata = []
    for member_indices in clusters:
        representatives.append(representative_of(member_indices))
        cluster_metadata.append(metadata_of(member_indices))

    return DataSet(representatives, cluster_metadata)