예제 #1
0
 def naive_has_similar(self, s, distance):
     """
     Report whether any stored string is within 'distance' of s.

     Every entry of self.strings is checked in turn. Entries whose
     length differs from that of s are skipped; the rest are compared
     to s with hamming_distance().

     Return True as soon as one entry has a distance of at most
     'distance', and False if no entry qualifies.
     """
     return any(
         len(candidate) == len(s)
         and hamming_distance(candidate, s) <= distance
         for candidate in self.strings
     )
예제 #2
0
def hamming_single_linkage(strings, mismatches, use_trie=False):
    """
    Cluster a set of strings by their hamming distance: Strings with
    a distance of at most 'mismatches' will be put into the same cluster.

    The input is first grouped by string length and only strings of the
    same length are compared with each other. For every length group, a
    graph is built in which two strings are joined by an edge when they
    are at most 'mismatches' apart; the connected components of these
    graphs are the clusters.

    strings -- iterable of the strings to cluster
    mismatches -- maximum distance at which two strings are linked
    use_trie -- if true, neighbors are looked up with
        Trie.find_all_similar() instead of comparing all pairs

    Return a list of connected components (clusters), collected from
    Graph.connected_components() of every length group.
    """
    # Local import keeps this function self-contained.
    from itertools import combinations

    components = []

    # First pre-cluster strings by length
    by_length = defaultdict(list)
    for s in strings:
        by_length[len(s)].append(s)

    # A separate loop variable is used so that the 'strings' argument is
    # not rebound while the groups are processed.
    for group in by_length.values():
        graph = Graph(group)
        if use_trie:
            trie = Trie()
            for s in group:
                trie.add(s)
            for s in group:
                for neighbor in trie.find_all_similar(s, mismatches):
                    # The query string itself is not linked to itself
                    if neighbor != s:
                        graph.add_edge(s, neighbor)
        else:
            # Visit every unordered pair exactly once, in the same order
            # as a nested index loop, without copying list slices.
            for s, t in combinations(group, 2):
                if hamming_distance(s, t) <= mismatches:
                    graph.add_edge(s, t)

        components.extend(graph.connected_components())
    return components
예제 #3
0
 def naive_find_all_similar(self, s, distance):
     """
     Lazily yield every stored string that is within 'distance' of s.

     Entries of self.strings are visited in order. An entry is yielded
     when it has the same length as s and hamming_distance() between
     the entry and s is at most 'distance'; entries of a different
     length are never passed to hamming_distance().
     """
     for candidate in self.strings:
         same_length = len(candidate) == len(s)
         if same_length and hamming_distance(candidate, s) <= distance:
             yield candidate
예제 #4
0
 def naive_has_similar(t):
     """
     Report whether any entry of 'strings' is within 'dist' of t.

     Both 'strings' and 'dist' are taken from the enclosing scope.
     No length filtering is done here: every entry is passed to
     hamming_distance() together with t.

     Return True as soon as one entry has a distance of at most 'dist',
     otherwise False.
     """
     return any(hamming_distance(s, t) <= dist for s in strings)