예제 #1
0
    def score(self,
              query,
              normalized=True,
              synonimizer=None,
              return_suffix_scores=False):
        """
        Match the string against the GAST using
        the algorithm described in [Chernyak, sections 1.3 & 1.4].

        Spaces are stripped from the query before matching. Every suffix
        of the remaining string is walked down the tree from the root; its
        score is the sum of the conditional probabilities of the nodes
        visited, plus the number of matched characters, minus the number
        of nodes visited. The final score is the sum of the suffix scores
        divided by the query length.

        query -- Unicode string to score
        normalized -- if true, each suffix score is divided by the number
                      of characters matched for that suffix
        synonimizer -- accepted for interface compatibility; it is not
                       used anywhere in this method
        return_suffix_scores -- if true, return a tuple
                                (score, {suffix: suffix score})

        Returns the score as a float, or the tuple described above.
        A query that is empty after stripping spaces scores 0.0.

        """

        query = query.replace(" ", "")
        suffix_scores = {}

        if not query:
            # No suffixes to match; without this guard the final averaging
            # step would divide by len(query) == 0.
            if return_suffix_scores:
                return 0.0, suffix_scores
            return 0.0

        result = 0

        # For each suffix of the string:
        for suffix_start in range(len(query)):

            suffix = query[suffix_start:]
            suffix_score = 0
            suffix_result = 0
            matched_chars = 0
            nodes_matched = 0

            child_node = self.root.chose_arc(suffix)
            while child_node:
                nodes_matched += 1
                (str_ind, substr_start, substr_end) = child_node.arc()
                match = utils.match_strings(
                    suffix,
                    self.strings_collection[str_ind][substr_start:substr_end])
                suffix_score += child_node.conditional_probability()
                matched_chars += match
                suffix = suffix[match:]
                # Descend only if the whole arc label was matched and there
                # is still something left of the suffix to match.
                if suffix and match == substr_end - substr_start:
                    child_node = child_node.chose_arc(suffix)
                else:
                    break

            if matched_chars:
                suffix_result = (suffix_score + matched_chars - nodes_matched)
                if normalized:
                    suffix_result /= matched_chars
                result += suffix_result

            # Suffixes that matched nothing are recorded with a score of 0.
            suffix_scores[query[suffix_start:]] = suffix_result

        result /= len(query)

        if return_suffix_scores:
            result = result, suffix_scores

        return result
예제 #2
0
 def _ukkonen_first_phases(string_ind):
     """
     Find how much of the given string is already encoded in the tree.

     Walks down from the root along the arcs matching the string and
     returns a tuple
     (length of the string prefix already in the tree,
      deepest node whose incoming arc was matched completely,
      (string index, start, end) of the partially matched arc, or
      (0, 0, 0) when the walk did not stop in the middle of an arc).

     """
     remaining = strings_collection[string_ind]
     encoded_len = 0
     node = root
     partial_path = (0, 0, 0)

     candidate = node.chose_arc(remaining)
     while candidate:
         arc_str, arc_from, arc_to = candidate.arc()
         arc_label = strings_collection[arc_str][arc_from:arc_to]
         common = utils.match_strings(remaining, arc_label)
         encoded_len += common
         if common != arc_to - arc_from:
             # Stopped in the middle of the arc: remember the matched part
             # of it as the path to follow when the first explicit phase
             # begins.
             partial_path = (arc_str, arc_from, arc_from + common)
             break
         # The whole arc matched, so continue from the child node.
         remaining = remaining[common:]
         node = candidate
         candidate = node.chose_arc(remaining)

     # For constant updating of all leafs, see [Gusfield {RUS}, p. 139]
     root._e[string_ind] = encoded_len

     return (encoded_len, node, partial_path)
예제 #3
0
    def score(self, query, normalized=True, synonimizer=None, return_suffix_scores=False):
        """
        Match the string against the GAST using
        the algorithm described in [Chernyak, sections 1.3 & 1.4].

        Spaces are stripped from the query before matching. Every suffix
        of the remaining string is walked down the tree from the root; its
        score is the sum of the conditional probabilities of the nodes
        visited, plus the number of matched characters, minus the number
        of nodes visited. The final score is the sum of the suffix scores
        divided by the query length.

        query -- Unicode string to score
        normalized -- if true, each suffix score is divided by the number
                      of characters matched for that suffix
        synonimizer -- accepted for interface compatibility; it is not
                       used anywhere in this method
        return_suffix_scores -- if true, return a tuple
                                (score, {suffix: suffix score})

        Returns the score as a float, or the tuple described above.
        A query that is empty after stripping spaces scores 0.0.

        """

        query = query.replace(" ", "")
        suffix_scores = {}

        if not query:
            # No suffixes to match; without this guard the final averaging
            # step would divide by len(query) == 0.
            if return_suffix_scores:
                return 0.0, suffix_scores
            return 0.0

        result = 0

        # For each suffix of the string
        # (range, not xrange, so the loop runs on Python 2 and Python 3):
        for suffix_start in range(len(query)):

            suffix = query[suffix_start:]
            suffix_score = 0
            suffix_result = 0
            matched_chars = 0
            nodes_matched = 0

            child_node = self.root.chose_arc(suffix)
            while child_node:
                nodes_matched += 1
                (str_ind, substr_start, substr_end) = child_node.arc()
                match = utils.match_strings(
                            suffix, self.strings_collection[str_ind][substr_start:substr_end])
                suffix_score += child_node.conditional_probability()
                matched_chars += match
                suffix = suffix[match:]
                # Descend only if the whole arc label was matched and there
                # is still something left of the suffix to match.
                if suffix and match == substr_end - substr_start:
                    child_node = child_node.chose_arc(suffix)
                else:
                    break

            if matched_chars:
                suffix_result = (suffix_score + matched_chars - nodes_matched)
                if normalized:
                    suffix_result /= matched_chars
                result += suffix_result

            # Suffixes that matched nothing are recorded with a score of 0.
            suffix_scores[query[suffix_start:]] = suffix_result

        result /= len(query)

        if return_suffix_scores:
            result = result, suffix_scores

        return result
예제 #4
0
    def _score(self, query, normalized=True, return_suffix_scores=False):
        result = 0
        suffix_scores = {}
        n = len(self.suftab)

        root_interval = (0, 0, n - 1)

        for suffix_start in range(len(query)):

            suffix = query[suffix_start:]
            suffix_score = 0
            suffix_result = 0
            matched_chars = 0
            nodes_matched = 0

            parent_node = root_interval
            child_node = self._get_child_interval(parent_node[1],
                                                  parent_node[2], suffix[0])
            while child_node:
                nodes_matched += 1
                # TODO: Use structs??? child_node[1] is actually cn.i; parent_node[0] == pn.l
                substr_start = self.suftab[child_node[1]] + parent_node[0]
                if self._is_leaf(child_node):
                    substr_end = n
                else:
                    substr_end = substr_start + child_node[0] - parent_node[0]
                match = utils.match_strings(
                    suffix, self.string[substr_start:substr_end])
                suffix_score += float(self._annotation(
                    child_node)) / self._annotation(parent_node)
                matched_chars += match
                suffix = suffix[match:]
                if suffix and match == substr_end - substr_start:
                    parent_node = child_node
                    child_node = self._get_child_interval(
                        parent_node[1], parent_node[2], suffix[0])
                else:
                    break

            if matched_chars:
                suffix_result = (suffix_score + matched_chars - nodes_matched)
                if normalized:
                    suffix_result /= matched_chars
                result += suffix_result

            suffix_scores[query[suffix_start:]] = suffix_result

        result /= len(query)

        if return_suffix_scores:
            result = result, suffix_scores

        return result
예제 #5
0
    def _score(self, query, normalized=True, return_suffix_scores=False):
        result = 0
        suffix_scores = {}
        n = len(self.suftab)

        root_interval = (0, 0, n - 1)
    
        for suffix_start in xrange(len(query)):
            
            suffix = query[suffix_start:]
            suffix_score = 0
            suffix_result = 0
            matched_chars = 0
            nodes_matched = 0
            
            parent_node = root_interval
            child_node = self._get_child_interval(parent_node[1], parent_node[2], suffix[0])
            while child_node:
                nodes_matched += 1
                # TODO: Use structs??? child_node[1] is actually cn.i; parent_node[0] == pn.l
                substr_start = self.suftab[child_node[1]] + parent_node[0]
                if self._is_leaf(child_node):
                    substr_end = n
                else:
                    substr_end = substr_start + child_node[0] - parent_node[0]
                match = utils.match_strings(suffix, self.string[substr_start:substr_end])
                suffix_score += float(self._annotation(child_node)) / self._annotation(parent_node)
                matched_chars += match
                suffix = suffix[match:]
                if suffix and match == substr_end - substr_start:
                    parent_node = child_node
                    child_node = self._get_child_interval(parent_node[1], parent_node[2], suffix[0])
                else:
                    break

            if matched_chars:
                suffix_result = (suffix_score + matched_chars - nodes_matched)
                if normalized:
                    suffix_result /= matched_chars
                result += suffix_result

            suffix_scores[query[suffix_start:]] = suffix_result
            
        result /= len(query)

        if return_suffix_scores:
            result = result, suffix_scores

        return result
예제 #6
0
 def test_match_strings_empty(self):
     # Inputs that share no leading characters must yield a match length of 0.
     cases = (
         (("abc", "bc"), 0),
         (("", ""), 0),
     )
     for arguments, expected in cases:
         self.assertEqual(utils.match_strings(*arguments), expected)
예제 #7
0
 def test_match_strings_full(self):
     # The first string is matched entirely, whether the second string is
     # identical to it or merely starts with it.
     cases = (
         (("abc", "abc"), 3),
         (("abc", "abcd"), 3),
     )
     for arguments, expected in cases:
         self.assertEqual(utils.match_strings(*arguments), expected)
예제 #8
0
 def test_match_strings_partial(self):
     # Matching stops at the first differing character.
     cases = (
         (("abc", "ac"), 1),
         (("mnc", "mnd"), 2),
     )
     for arguments, expected in cases:
         self.assertEqual(utils.match_strings(*arguments), expected)
예제 #9
0
 def _construct(self, strings_collection):
     '''
     Build the generalized annotated suffix tree with the naive algorithm.

     Worst-case time complexity is quadratic, O(n_1^2 + ... + n_m^2),
     where m is the number of strings in the collection.

     Returns the root node. Every node on an inserted suffix's path has
     its weight incremented, and the root's weight is increased by the
     sum of its children's weights.

     '''

     # 0. Give each string a unique terminating character,
     #    which keeps the insertion logic below simple
     strings_collection = utils.make_unique_endings(strings_collection)

     root = ast.AnnotatedSuffixTree.Node()
     root.strings_collection = strings_collection

     for string_ind in xrange(len(strings_collection)):
         text = strings_collection[string_ind]
         text_end = len(text)
         # Insert every suffix except the one consisting solely of
         # the unique terminating character
         for first_char in xrange(text_end - 1):
             cursor = first_char
             pending = text[cursor:]
             parent = root
             branch = parent.chose_arc(pending)
             while branch:
                 (arc_str, arc_from, arc_to) = branch.arc()
                 arc_label = strings_collection[arc_str][arc_from:arc_to]
                 common = utils.match_strings(pending, arc_label)
                 if common != arc_to - arc_from:
                     # The match ends inside this arc: split it with a new
                     # inner node that owns the matched part, hang a new
                     # leaf for the rest of the suffix under it, and
                     # re-attach the old child with a shortened arc.
                     # (With unique string endings this is the only
                     #  alternative to matching the whole arc.)
                     parent.remove_child(branch)
                     inner = parent.add_new_child(string_ind, cursor,
                                                  cursor + common)
                     leaf = inner.add_new_child(string_ind, cursor + common,
                                                text_end)
                     (old_str, old_from, old_to) = branch._arc
                     branch._arc = (old_str, old_from + common, old_to)
                     inner.add_child(branch)
                     leaf.weight = 1
                     inner.weight = 1 + branch.weight
                     pending = ''
                     break
                 # Whole arc matched: step into the child and count this
                 # suffix in its weight
                 pending = pending[common:]
                 cursor += common
                 parent = branch
                 parent.weight += 1
                 branch = parent.chose_arc(pending)

             # No arc continues the suffix: attach what is left as a leaf
             if pending:
                 leaf = parent.add_new_child(string_ind, cursor, text_end)
                 leaf.weight = 1

     # The root is annotated with the total weight of its children,
     # which keeps string matching simple
     for key in root.children:
         root.weight += root.children[key].weight

     return root
예제 #10
0
 def test_match_strings_empty(self):
     # Strings that differ at the first character share nothing.
     no_common_prefix = utils.match_strings("abc", "bc")
     self.assertEqual(no_common_prefix, 0)
     # Two empty strings also give a match length of 0.
     both_empty = utils.match_strings("", "")
     self.assertEqual(both_empty, 0)
예제 #11
0
 def test_match_strings_full(self):
     # Identical strings match over their whole length.
     identical = utils.match_strings("abc", "abc")
     self.assertEqual(identical, 3)
     # A longer second string still matches all of the first one.
     first_is_prefix = utils.match_strings("abc", "abcd")
     self.assertEqual(first_is_prefix, 3)
예제 #12
0
 def test_match_strings_partial(self):
     # Only the leading "a" is shared.
     one_shared = utils.match_strings("abc", "ac")
     self.assertEqual(one_shared, 1)
     # "mn" is shared; the third characters differ.
     two_shared = utils.match_strings("mnc", "mnd")
     self.assertEqual(two_shared, 2)
예제 #13
0
    def _construct(self, strings_collection):
        """
        Build the generalized annotated suffix tree with the naive algorithm.

        Worst-case time complexity is quadratic, O(n_1^2 + ... + n_m^2),
        where m is the number of strings in the collection.

        Returns the root node. Every node on an inserted suffix's path has
        its weight incremented, and the root's weight is increased by the
        sum of its children's weights.

        """

        # 0. Give each string a unique terminating character,
        #    which keeps the insertion logic below simple
        strings_collection = utils.make_unique_endings(strings_collection)

        root = ast.AnnotatedSuffixTree.Node()
        root.strings_collection = strings_collection

        for string_ind in xrange(len(strings_collection)):
            word = strings_collection[string_ind]
            word_len = len(word)
            # Insert every suffix except the one consisting solely of
            # the unique terminating character
            for begin in xrange(word_len - 1):
                offset = begin
                tail = word[offset:]
                current = root
                following = current.chose_arc(tail)
                while following:
                    (owner, lo, hi) = following.arc()
                    shared = utils.match_strings(
                        tail, strings_collection[owner][lo:hi])
                    if shared == hi - lo:
                        # Whole arc matched: step into the child and count
                        # this suffix in its weight
                        tail = tail[shared:]
                        offset += shared
                        current = following
                        current.weight += 1
                        following = current.chose_arc(tail)
                        continue

                    # The match ends inside this arc: split it with a new
                    # inner node that owns the matched part, hang a new leaf
                    # for the rest of the suffix under it, and re-attach the
                    # old child with a shortened arc.
                    # (With unique string endings this is the only
                    #  alternative to matching the whole arc.)
                    current.remove_child(following)
                    fork = current.add_new_child(string_ind, offset,
                                                 offset + shared)
                    fresh_leaf = fork.add_new_child(
                        string_ind, offset + shared, word_len)
                    (old_owner, old_lo, old_hi) = following._arc
                    following._arc = (old_owner, old_lo + shared, old_hi)
                    fork.add_child(following)
                    fresh_leaf.weight = 1
                    fork.weight = 1 + following.weight
                    break
                else:
                    # Reached only when no arc was split: if part of the
                    # suffix is still unmatched, attach it as a new leaf
                    if tail:
                        fresh_leaf = current.add_new_child(string_ind, offset,
                                                           word_len)
                        fresh_leaf.weight = 1

        # The root is annotated with the total weight of its children,
        # which keeps string matching simple
        for key in root.children:
            root.weight += root.children[key].weight

        return root