def score(self, query, normalized=True, synonimizer=None, return_suffix_scores=False):
    """
    Match the query string against the generalized annotated suffix tree
    using the algorithm described in [Chernyak, sections 1.3 & 1.4].

    Every suffix of the query is walked down the tree starting at the root.
    A suffix scores the sum of the conditional probabilities of the nodes
    visited, plus the number of matched characters, minus the number of
    nodes visited. The final score is the sum of the suffix scores divided
    by the query length.

    query -- Unicode string; spaces are removed before matching
    normalized -- if true, each suffix score is divided by the number of
                  characters matched for that suffix
    synonimizer -- kept for interface compatibility; not used in this method
    return_suffix_scores -- if true, return a tuple (score, suffix_scores)
                            where suffix_scores maps every suffix of the
                            query to its own score

    Returns the score (the original documentation describes it as a float
    in [0, 1]). A query that is empty once spaces are removed scores 0.0
    instead of raising ZeroDivisionError.
    """
    query = query.replace(" ", "")
    suffix_scores = {}

    if not query:
        # No suffixes to match; the averaging step below would divide by zero.
        return (0.0, suffix_scores) if return_suffix_scores else 0.0

    result = 0

    # For each suffix of the string:
    for suffix_start in range(len(query)):
        full_suffix = query[suffix_start:]
        suffix = full_suffix
        suffix_score = 0
        suffix_result = 0
        matched_chars = 0
        nodes_matched = 0

        child_node = self.root.chose_arc(suffix)
        while child_node:
            nodes_matched += 1
            (str_ind, substr_start, substr_end) = child_node.arc()
            # Length of the common prefix of the remaining suffix and the arc label
            match = utils.match_strings(
                suffix, self.strings_collection[str_ind][substr_start:substr_end])
            suffix_score += child_node.conditional_probability()
            matched_chars += match
            suffix = suffix[match:]
            # Descend only if the whole arc label matched and input remains
            if suffix and match == substr_end - substr_start:
                child_node = child_node.chose_arc(suffix)
            else:
                break

        if matched_chars:
            suffix_result = (suffix_score + matched_chars - nodes_matched)
            if normalized:
                suffix_result /= matched_chars
            result += suffix_result

        suffix_scores[full_suffix] = suffix_result

    result /= len(query)

    if return_suffix_scores:
        result = result, suffix_scores

    return result
def _ukkonen_first_phases(string_ind):
    """
    Find how much of the given string is already encoded in the tree.

    Walks down from the root along arcs matching the string for as long as
    whole arc labels match.

    Returns a tuple of form
    ([length of the already encoded string prefix],
     [tree node to start the first explicit phase with],
     [path to go down at the beginning of the first explicit phase]).
    The path stays (0, 0, 0) unless the walk stops in the middle of an arc.
    """
    remaining = strings_collection[string_ind]
    encoded_len = 0
    path = (0, 0, 0)
    node = root

    candidate = node.chose_arc(remaining)
    while candidate:
        arc_str, arc_from, arc_to = candidate.arc()
        arc_label = strings_collection[arc_str][arc_from:arc_to]
        common = utils.match_strings(remaining, arc_label)
        encoded_len += common

        if common != arc_to - arc_from:
            # Stopped inside the arc: the first explicit phase will have
            # to go down this partial path first.
            path = (arc_str, arc_from, arc_from + common)
            break

        # The whole arc matched; continue from the child node.
        remaining = remaining[common:]
        node = candidate
        candidate = node.chose_arc(remaining)

    # For constant updating of all leafs, see [Gusfield {RUS}, p. 139]
    root._e[string_ind] = encoded_len

    return (encoded_len, node, path)
def score(self, query, normalized=True, synonimizer=None, return_suffix_scores=False):
    """
    Match the query string against the generalized annotated suffix tree
    using the algorithm described in [Chernyak, sections 1.3 & 1.4].

    Each suffix of the query is matched from the root downwards. Its score
    is the sum of the conditional probabilities of the visited nodes, plus
    the number of matched characters, minus the number of visited nodes.
    The suffix scores are summed and divided by the query length.

    query -- Unicode string; spaces are removed before matching
    normalized -- if true, each suffix score is divided by the number of
                  characters matched for that suffix
    synonimizer -- kept for interface compatibility; not used in this method
    return_suffix_scores -- if true, return a tuple (score, suffix_scores)
                            where suffix_scores maps every suffix of the
                            query to its own score

    Returns the score (the original documentation describes it as a float
    in [0, 1]). A query that is empty once spaces are removed scores 0.0
    instead of raising ZeroDivisionError.
    """
    query = query.replace(" ", "")
    suffix_scores = {}

    if not query:
        # No suffixes to match; the averaging step below would divide by zero.
        return (0.0, suffix_scores) if return_suffix_scores else 0.0

    result = 0

    # For each suffix of the string
    # (range rather than xrange so the method also runs on Python 3):
    for suffix_start in range(len(query)):
        whole_suffix = query[suffix_start:]
        suffix = whole_suffix
        suffix_score = 0
        suffix_result = 0
        matched_chars = 0
        nodes_matched = 0

        child_node = self.root.chose_arc(suffix)
        while child_node:
            nodes_matched += 1
            (str_ind, substr_start, substr_end) = child_node.arc()
            # Length of the common prefix of the remaining suffix and the arc label
            match = utils.match_strings(
                suffix, self.strings_collection[str_ind][substr_start:substr_end])
            suffix_score += child_node.conditional_probability()
            matched_chars += match
            suffix = suffix[match:]
            # Descend only if the whole arc label matched and input remains
            if suffix and match == substr_end - substr_start:
                child_node = child_node.chose_arc(suffix)
            else:
                break

        if matched_chars:
            suffix_result = (suffix_score + matched_chars - nodes_matched)
            if normalized:
                suffix_result /= matched_chars
            result += suffix_result

        suffix_scores[whole_suffix] = suffix_result

    result /= len(query)

    if return_suffix_scores:
        result = result, suffix_scores

    return result
def _score(self, query, normalized=True, return_suffix_scores=False):
    """
    Score the query against the suffix-array representation.

    Each suffix of the query is matched by descending through child
    intervals, starting from the root interval (0, 0, n - 1) where n is the
    length of self.suftab. A suffix scores the sum of
    annotation(child) / annotation(parent) over the visited intervals, plus
    the number of matched characters, minus the number of visited intervals.
    The suffix scores are summed and divided by the query length.

    query -- string to match
    normalized -- if true, each suffix score is divided by the number of
                  characters matched for that suffix
    return_suffix_scores -- if true, return a tuple (score, suffix_scores)
                            where suffix_scores maps every suffix of the
                            query to its own score

    Returns the score. An empty query scores 0.0 instead of raising
    ZeroDivisionError.
    """
    suffix_scores = {}

    if not query:
        # No suffixes to match; the averaging step below would divide by zero.
        return (0.0, suffix_scores) if return_suffix_scores else 0.0

    result = 0
    n = len(self.suftab)
    # Intervals are 3-tuples; element 0 is `l` and element 1 is `i`
    # (see the TODO below). NOTE(review): element 2 is presumably the
    # right bound `j` -- confirm against _get_child_interval.
    root_interval = (0, 0, n - 1)

    for suffix_start in range(len(query)):
        full_suffix = query[suffix_start:]
        suffix = full_suffix
        suffix_score = 0
        suffix_result = 0
        matched_chars = 0
        nodes_matched = 0

        parent_node = root_interval
        child_node = self._get_child_interval(parent_node[1], parent_node[2], suffix[0])
        while child_node:
            nodes_matched += 1
            # TODO: Use structs??? child_node[1] is actually cn.i; parent_node[0] == pn.l
            substr_start = self.suftab[child_node[1]] + parent_node[0]
            if self._is_leaf(child_node):
                # A leaf's label runs to the end of the underlying string
                substr_end = n
            else:
                substr_end = substr_start + child_node[0] - parent_node[0]
            # Length of the common prefix of the remaining suffix and the label
            match = utils.match_strings(
                suffix, self.string[substr_start:substr_end])
            suffix_score += float(self._annotation(
                child_node)) / self._annotation(parent_node)
            matched_chars += match
            suffix = suffix[match:]
            # Descend only if the whole label matched and input remains
            if suffix and match == substr_end - substr_start:
                parent_node = child_node
                child_node = self._get_child_interval(
                    parent_node[1], parent_node[2], suffix[0])
            else:
                break

        if matched_chars:
            suffix_result = (suffix_score + matched_chars - nodes_matched)
            if normalized:
                suffix_result /= matched_chars
            result += suffix_result

        suffix_scores[full_suffix] = suffix_result

    result /= len(query)

    if return_suffix_scores:
        result = result, suffix_scores

    return result
def _score(self, query, normalized=True, return_suffix_scores=False):
    """
    Score the query against the suffix-array representation.

    Each suffix of the query is matched by descending through child
    intervals, starting from the root interval (0, 0, n - 1) where n is the
    length of self.suftab. A suffix scores the sum of
    annotation(child) / annotation(parent) over the visited intervals, plus
    the number of matched characters, minus the number of visited intervals.
    The suffix scores are summed and divided by the query length.

    query -- string to match
    normalized -- if true, each suffix score is divided by the number of
                  characters matched for that suffix
    return_suffix_scores -- if true, return a tuple (score, suffix_scores)
                            where suffix_scores maps every suffix of the
                            query to its own score

    Returns the score. An empty query scores 0.0 instead of raising
    ZeroDivisionError.
    """
    suffix_scores = {}

    if not query:
        # No suffixes to match; the averaging step below would divide by zero.
        return (0.0, suffix_scores) if return_suffix_scores else 0.0

    result = 0
    n = len(self.suftab)
    # Intervals are 3-tuples; element 0 is `l` and element 1 is `i`
    # (see the TODO below). NOTE(review): element 2 is presumably the
    # right bound `j` -- confirm against _get_child_interval.
    root_interval = (0, 0, n - 1)

    # range rather than xrange so the method also runs on Python 3
    for suffix_start in range(len(query)):
        whole_suffix = query[suffix_start:]
        suffix = whole_suffix
        suffix_score = 0
        suffix_result = 0
        matched_chars = 0
        nodes_matched = 0

        parent_node = root_interval
        child_node = self._get_child_interval(parent_node[1], parent_node[2], suffix[0])
        while child_node:
            nodes_matched += 1
            # TODO: Use structs??? child_node[1] is actually cn.i; parent_node[0] == pn.l
            substr_start = self.suftab[child_node[1]] + parent_node[0]
            if self._is_leaf(child_node):
                # A leaf's label runs to the end of the underlying string
                substr_end = n
            else:
                substr_end = substr_start + child_node[0] - parent_node[0]
            # Length of the common prefix of the remaining suffix and the label
            match = utils.match_strings(suffix, self.string[substr_start:substr_end])
            suffix_score += float(self._annotation(child_node)) / self._annotation(parent_node)
            matched_chars += match
            suffix = suffix[match:]
            # Descend only if the whole label matched and input remains
            if suffix and match == substr_end - substr_start:
                parent_node = child_node
                child_node = self._get_child_interval(parent_node[1], parent_node[2], suffix[0])
            else:
                break

        if matched_chars:
            suffix_result = (suffix_score + matched_chars - nodes_matched)
            if normalized:
                suffix_result /= matched_chars
            result += suffix_result

        suffix_scores[whole_suffix] = suffix_result

    result /= len(query)

    if return_suffix_scores:
        result = result, suffix_scores

    return result
def test_match_strings_empty(self):
    """Expect a match length of 0 when the inputs share no leading characters."""
    no_overlap_pairs = (
        ("abc", "bc"),
        ("", ""),
    )
    for left, right in no_overlap_pairs:
        self.assertEqual(utils.match_strings(left, right), 0)
def test_match_strings_full(self):
    """Expect the full length of the first string when it is a prefix of the second."""
    whole_match_pairs = (
        ("abc", "abc"),
        ("abc", "abcd"),
    )
    for left, right in whole_match_pairs:
        self.assertEqual(utils.match_strings(left, right), 3)
def test_match_strings_partial(self):
    """Expect the length of the shared leading part when the strings diverge."""
    cases = (
        ("abc", "ac", 1),
        ("mnc", "mnd", 2),
    )
    for left, right, expected in cases:
        self.assertEqual(utils.match_strings(left, right), expected)
def _construct(self, strings_collection):
    '''
    Naive generalized suffix tree construction algorithm,
    with quadratic [O(n_1^2 + ... + n_m^2)] worst-case time complexity,
    where m is the number of strings in collection.

    strings_collection -- the strings to index; a unique ending is appended
                          to each of them before the tree is built
    Returns the root node. Every node visited or created while inserting a
    suffix has its weight increased, and the root's weight is the sum of
    its children's weights.
    '''

    # 0. Add a unique character to each string in the collection,
    #    to preserve simplicity while building the tree
    strings_collection = utils.make_unique_endings(strings_collection)

    root = ast.AnnotatedSuffixTree.Node()
    root.strings_collection = strings_collection

    # For each string in the collection...
    # (enumerate/range instead of xrange so this also runs on Python 3)
    for string_ind, string in enumerate(strings_collection):
        # For each suffix of that string...
        # (do not handle unique last characters as suffixes)
        for suffix_start in range(len(string)-1):
            suffix = string[suffix_start:]
            # ... first try to find maximal matching path
            node = root
            child_node = node.chose_arc(suffix)
            while child_node:
                (str_ind, substr_start, substr_end) = child_node.arc()
                match = utils.match_strings(
                    suffix, strings_collection[str_ind][substr_start:substr_end])
                if match == substr_end-substr_start:
                    # matched the arc, proceed with child node
                    suffix = suffix[match:]
                    # Advancing the loop variable is intentional: it tracks
                    # where the unmatched remainder starts in `string`; the
                    # for statement rebinds it on the next iteration.
                    suffix_start += match
                    node = child_node
                    node.weight += 1
                    child_node = node.chose_arc(suffix)
                else:
                    # ... then, where the matching path ends;
                    # create new inner node
                    # (that's the only possible alternative
                    #  since we have unique string endings)
                    node.remove_child(child_node)
                    new_node = node.add_new_child(string_ind, suffix_start,
                                                  suffix_start+match)
                    new_leaf = new_node.add_new_child(string_ind, suffix_start+match,
                                                      len(string))
                    # Shorten the old child's arc by the part now owned by
                    # the new inner node, then re-attach it below that node
                    (osi, oss, ose) = child_node._arc
                    child_node._arc = (osi, oss+match, ose)
                    new_node.add_child(child_node)
                    new_leaf.weight = 1
                    new_node.weight = 1 + child_node.weight
                    # Mark the suffix as fully handled
                    suffix = ''
                    break

            # ... or create new leaf if there was no appropriate arc to proceed
            if suffix:
                new_leaf = node.add_new_child(string_ind, suffix_start, len(string))
                new_leaf.weight = 1

    # Root will also be annotated by the weight of its children,
    # to preserve simplicity while calculating string matching
    for k in root.children:
        root.weight += root.children[k].weight

    return root
def test_match_strings_empty(self):
    """Expect 0 both for strings with different first characters and for two empty strings."""
    for first, second in [("abc", "bc"), ("", "")]:
        matched = utils.match_strings(first, second)
        self.assertEqual(matched, 0)
def test_match_strings_full(self):
    """Expect 3 when all of "abc" is matched, whether or not the other string is longer."""
    for first, second in [("abc", "abc"), ("abc", "abcd")]:
        matched = utils.match_strings(first, second)
        self.assertEqual(matched, 3)
def test_match_strings_partial(self):
    """Expect only the shared leading characters to be counted when the strings diverge."""
    expectations = [
        (("abc", "ac"), 1),
        (("mnc", "mnd"), 2),
    ]
    for (first, second), expected in expectations:
        matched = utils.match_strings(first, second)
        self.assertEqual(matched, expected)
def _construct(self, strings_collection):
    """
    Naive generalized suffix tree construction algorithm,
    with quadratic [O(n_1^2 + ... + n_m^2)] worst-case time complexity,
    where m is the number of strings in collection.

    strings_collection -- the strings to index; a unique ending is appended
                          to each of them before the tree is built
    Returns the root node. Every node visited or created while inserting a
    suffix has its weight increased, and the root's weight is the sum of
    its children's weights.
    """

    # 0. Add a unique character to each string in the collection,
    #    to preserve simplicity while building the tree
    strings_collection = utils.make_unique_endings(strings_collection)

    root = ast.AnnotatedSuffixTree.Node()
    root.strings_collection = strings_collection

    # For each string in the collection...
    # (enumerate/range instead of xrange so this also runs on Python 3)
    for string_ind, string in enumerate(strings_collection):
        # For each suffix of that string...
        # (do not handle unique last characters as suffixes)
        for suffix_start in range(len(string) - 1):
            suffix = string[suffix_start:]
            # ... first try to find maximal matching path
            node = root
            child_node = node.chose_arc(suffix)
            while child_node:
                (str_ind, substr_start, substr_end) = child_node.arc()
                match = utils.match_strings(
                    suffix, strings_collection[str_ind][substr_start:substr_end])
                if match == substr_end - substr_start:
                    # matched the arc, proceed with child node
                    suffix = suffix[match:]
                    # Advancing the loop variable is intentional: it tracks
                    # where the unmatched remainder starts in `string`; the
                    # for statement rebinds it on the next iteration.
                    suffix_start += match
                    node = child_node
                    node.weight += 1
                    child_node = node.chose_arc(suffix)
                else:
                    # ... then, where the matching path ends;
                    # create new inner node
                    # (that's the only possible alternative
                    #  since we have unique string endings)
                    node.remove_child(child_node)
                    new_node = node.add_new_child(string_ind, suffix_start,
                                                  suffix_start + match)
                    new_leaf = new_node.add_new_child(
                        string_ind, suffix_start + match, len(string))
                    # Shorten the old child's arc by the part now owned by
                    # the new inner node, then re-attach it below that node
                    (osi, oss, ose) = child_node._arc
                    child_node._arc = (osi, oss + match, ose)
                    new_node.add_child(child_node)
                    new_leaf.weight = 1
                    new_node.weight = 1 + child_node.weight
                    # Mark the suffix as fully handled
                    suffix = ''
                    break

            # ... or create new leaf if there was no appropriate arc to proceed
            if suffix:
                new_leaf = node.add_new_child(string_ind, suffix_start, len(string))
                new_leaf.weight = 1

    # Root will also be annotated by the weight of its children,
    # to preserve simplicity while calculating string matching
    for k in root.children:
        root.weight += root.children[k].weight

    return root