def naive_has_similar(self, s, distance):
    """
    Brute-force check whether any stored string is similar to s.

    Scans self.strings and considers only entries whose length equals
    len(s). Return True as soon as one of them has a
    hamming_distance() to s of at most 'distance', otherwise False.
    """
    # 'and' short-circuits, so hamming_distance() is only ever called on
    # strings of equal length.
    return any(
        len(candidate) == len(s)
        and hamming_distance(candidate, s) <= distance
        for candidate in self.strings
    )
def hamming_single_linkage(strings, mismatches, use_trie=False):
    """
    Cluster a set of strings by their hamming distance: Strings with a
    distance of at most 'mismatches' will be put into the same cluster.

    strings -- iterable of the strings to cluster
    mismatches -- maximum distance at which two strings get linked
    use_trie -- if true, neighbors are looked up through a Trie
        (find_all_similar); otherwise all pairs of equal-length strings
        are compared with hamming_distance()

    Return a list of connected components (clusters).
    """
    components = []

    # First pre-cluster strings by length: only strings of equal length
    # are ever compared with each other.
    by_length = defaultdict(list)
    for s in strings:
        by_length[len(s)].append(s)

    # One graph per length group. The group gets its own name so that the
    # 'strings' parameter is not rebound inside the loop.
    for group in by_length.values():
        graph = Graph(group)
        if use_trie:
            trie = Trie()
            for s in group:
                trie.add(s)
            for s in group:
                for neighbor in trie.find_all_similar(s, mismatches):
                    # The query string itself is among the results; do not
                    # link a string to itself.
                    if neighbor != s:
                        graph.add_edge(s, neighbor)
        else:
            # Compare each unordered pair exactly once: s against every
            # string that comes after it in the group.
            for i, s in enumerate(group):
                for t in group[i + 1:]:
                    if hamming_distance(s, t) <= mismatches:
                        graph.add_edge(s, t)
        components.extend(graph.connected_components())

    return components
def naive_find_all_similar(self, s, distance):
    """
    Brute-force generator over all stored strings similar to s.

    Walks self.strings in order and yields every entry that has the same
    length as s and a hamming_distance() to s of at most 'distance'.
    """
    for candidate in self.strings:
        # Length is checked first so that hamming_distance() only sees
        # strings of equal length.
        same_length = len(candidate) == len(s)
        if same_length and hamming_distance(candidate, s) <= distance:
            yield candidate
def naive_has_similar(t):
    """
    Return True if at least one entry of 'strings' has a
    hamming_distance() to t of at most 'dist', otherwise False.

    'strings' and 'dist' are not parameters; they are taken from the
    surrounding scope.
    """
    return any(hamming_distance(other, t) <= dist for other in strings)