Exemplo n.º 1
0
    def search(self, pattern, length=None, count=10, use_cromulence=False):
        """
        Return up to `count` scored results whose text matches `pattern`.

        Each result is a (score, text) pair, best score first. The score is
        the log probability by default; with `use_cromulence` set, it is
        converted with `self.logprob_to_cromulence` and the results are
        re-sorted by that value.

        `length`, when given, is forwarded to `grep` for variable-length
        patterns. For fixed-length patterns, a `length` the pattern cannot
        have produces an empty list.
        """
        pattern = unspaced_lower(pattern)

        # An exact pattern has exactly one candidate: score it directly.
        if is_exact(pattern):
            score_exact = self.cromulence if use_cromulence else self.text_logprob
            return [score_exact(pattern)]

        shortest, longest = regex_len(pattern)
        if shortest == longest:
            if length is not None and (length < shortest or length > longest):
                # The requested length can never match this pattern.
                return []

            # Dynamic programming over the right edge of the pattern:
            # ranked_by_edge[i] holds the best results covering the first i
            # positions. Index 0 is an empty placeholder so indices line up
            # with edge positions.
            ranked_by_edge = [[]]
            for end in range(1, longest + 1):
                # Candidates that cover pattern[0:end] with a single entry.
                whole_prefix = regex_slice(pattern, 0, end)
                candidates = list(islice(self.grep(whole_prefix), count))

                # Candidates built from a best result ending at `start`
                # followed by one entry covering pattern[start:end].
                for start in range(1, end):
                    heads = ranked_by_edge[start]
                    if not heads:
                        continue
                    tail_pattern = regex_slice(pattern, start, end)
                    tails = list(islice(self.grep(tail_pattern), count))
                    # Every join of two pieces subtracts log(10) from the
                    # combined score.
                    candidates.extend(
                        (head_score + tail_score - log(10),
                         head_text + ' ' + tail_text)
                        for head_score, head_text in heads
                        for tail_score, tail_text in tails
                    )

                ranked_by_edge.append(
                    sorted(candidates, reverse=True)[:count])
            found = ranked_by_edge[-1]
        else:
            # Variable-length matches cannot be assembled edge by edge, so
            # fall back on grepping the wordlist for complete matches.
            complete = sorted(self.grep(pattern, length=length), reverse=True)
            found = complete[:count]

        if not use_cromulence:
            return found

        rescored = [
            (self.logprob_to_cromulence(logprob, len(slugify(text))), text)
            for logprob, text in found
        ]
        rescored.sort(reverse=True)
        return rescored
Exemplo n.º 2
0
    def grep(self, pattern, length=None, count=1000):
        """
        Search the wordlist quickly for words matching a given pattern.
        Yield them as they are found (not in sorted order).

        Yields (logprob, text) for each match, as returned by
        `self.segment_logprob`.

        `pattern` is a regular expression; it is normalized with
        `unspaced_lower` first. An exact (non-regex) pattern is looked up
        directly and yields at most one result.

        `length`, if given (and non-zero), restricts the search to entries of
        that length; otherwise the range of lengths comes from
        `regex_len(pattern)`, clamped to 1..`self.max_indexed_length`.

        `count` is the maximum number of regex matches to yield across all
        lengths.
        """
        pattern = unspaced_lower(pattern)
        if is_exact(pattern):
            if pattern in self:
                yield self.segment_logprob(pattern)
            return

        if length:
            minlen = maxlen = length
        else:
            minlen, maxlen = regex_len(pattern)
        if minlen < 1:
            minlen = 1
        if maxlen > self.max_indexed_length:
            maxlen = self.max_indexed_length

        # Nothing to scan: stop before touching the pattern at all, so an
        # out-of-range request never fails on encoding or regex compilation.
        if minlen > maxlen or count < 1:
            return

        # The regexes do not depend on the length being scanned, so build
        # them once. The pattern is wrapped in a non-capturing group so that
        # a top-level alternation ("a|b") cannot detach the line anchor or
        # the trailing comma from part of the pattern.
        grouped = b'(?:' + pattern.encode('ascii') + b')'
        at_start = re.compile(b'^' + grouped + b',')
        after_newline = re.compile(b'\n' + grouped + b',')

        num_found = 0
        for cur_length in range(minlen, maxlen + 1):
            if cur_length not in self._grep_maps:
                mm = self._open_mmap(
                    wordlist_path_from_name('greppable/%s.%d' %
                                            (self.name, cur_length)))
                self._grep_maps[cur_length] = mm
            else:
                mm = self._grep_maps[cur_length]

            # The first entry has no preceding newline, so it needs its own
            # anchored check. It counts toward `count` like any other match.
            match = at_start.match(mm)
            if match:
                # Drop the trailing comma from the matched span.
                found = mm[match.start():match.end() - 1].decode('ascii')
                num_found += 1
                yield self.segment_logprob(found)
                if num_found >= count:
                    return

            for match in after_newline.finditer(mm):
                # Drop the leading newline and the trailing comma.
                found = mm[match.start() + 1:match.end() - 1].decode('ascii')
                num_found += 1
                yield self.segment_logprob(found)
                if num_found >= count:
                    return
Exemplo n.º 3
0
 def __len__(self):
     """Return whatever `regex_len` computes for this object's `expr`."""
     # NOTE(review): elsewhere in this file the result of regex_len() is
     # unpacked as (minlen, maxlen). If it returns a tuple here as well,
     # calling len() on this object raises TypeError, because __len__ must
     # return an integer -- confirm which value is intended.
     expr_length = regex_len(self.expr)
     return expr_length