def search(self, pattern, length=None, count=10, use_cromulence=False):
    """
    Return up to `count` of the best results matching `pattern`, as
    (score, text) pairs sorted from best to worst.

    The score is a log probability by default, or a cromulence value when
    `use_cromulence` is true. If the length of the answer is known, it can
    be passed as `length`.

    An exact (literal) pattern is scored directly and returned as a
    one-element list. A fixed-length pattern is solved by dynamic
    programming over split points, so that multi-word answers joined by
    spaces can be found. A variable-length pattern falls back on grepping
    the wordlist for complete matches.
    """
    pattern = unspaced_lower(pattern)

    if is_exact(pattern):
        scorer = self.cromulence if use_cromulence else self.text_logprob
        return [scorer(pattern)]

    minlen, maxlen = regex_len(pattern)
    if minlen == maxlen:
        if length is not None and not (minlen <= length <= maxlen):
            # This length is impossible, so there are no results.
            return []

        # Each space inserted between two pieces costs log(10).
        space_penalty = log(10)

        # partials[i] holds the best results found for the first i
        # characters of the pattern; index 0 is an empty placeholder.
        partials = [[]]
        for right_edge in range(1, maxlen + 1):
            # Candidates that cover the whole prefix as a single entry.
            whole = regex_slice(pattern, 0, right_edge)
            candidates = list(islice(self.grep(whole), count))

            # Candidates that extend a shorter prefix with one more entry.
            for left_edge in range(1, right_edge):
                prefixes = partials[left_edge]
                if not prefixes:
                    continue
                tail = regex_slice(pattern, left_edge, right_edge)
                suffixes = list(islice(self.grep(tail), count))
                candidates.extend(
                    (lprob + rprob - space_penalty, ltext + ' ' + rtext)
                    for lprob, ltext in prefixes
                    for rprob, rtext in suffixes
                )

            candidates.sort(reverse=True)
            partials.append(candidates[:count])
        found = partials[-1]
    else:
        # If there are variable-length matches, the dynamic programming
        # strategy won't work, so fall back on grepping for complete
        # matches in the wordlist.
        ranked = sorted(self.grep(pattern, length=length), reverse=True)
        found = ranked[:count]

    if not use_cromulence:
        return found

    # Convert each log probability to a cromulence, based on the length of
    # the slugified text, and re-sort by the new score.
    rescored = [
        (self.logprob_to_cromulence(logprob, len(slugify(text))), text)
        for logprob, text in found
    ]
    rescored.sort(reverse=True)
    return rescored
def grep(self, pattern, length=None, count=1000):
    """
    Search the wordlist quickly for words matching a given pattern.
    Yield them as they are found (not in sorted order).

    Yields (logprob, text) for each match.

    The pattern is first normalized with `unspaced_lower`. An exact
    (literal) pattern is looked up directly and yields at most one result.
    Otherwise, one memory-mapped file per candidate length is scanned with
    a regular expression; the maps are opened on demand and cached in
    `self._grep_maps`.

    If `length` is given (and non-zero), only entries of that length are
    searched; otherwise the range of lengths comes from `regex_len(pattern)`.
    Either way the range is clamped to 1 through `self.max_indexed_length`.

    When scanning the files, iteration stops as soon as `count` matches
    have been yielded.
    """
    pattern = unspaced_lower(pattern)
    if is_exact(pattern):
        if pattern in self:
            yield self.segment_logprob(pattern)
        return

    # A falsy length (None or 0) means "infer the lengths from the pattern".
    if length:
        minlen = maxlen = length
    else:
        minlen, maxlen = regex_len(pattern)
    minlen = max(minlen, 1)
    maxlen = min(maxlen, self.max_indexed_length)

    # The patterns are the same for every length, so build them once.
    # An entry either starts the file or follows a newline, and its text
    # is terminated by a comma.
    # NOTE(review): a pattern with top-level alternation (`a|b`) would not
    # be anchored as a whole by this prefix and suffix -- confirm whether
    # such patterns can reach this point.
    pbytes = pattern.encode('ascii')
    first_entry_re = re.compile(b'^' + pbytes + b',')
    later_entry_re = re.compile(b'\n' + pbytes + b',')

    num_found = 0
    for cur_length in range(minlen, maxlen + 1):
        if cur_length not in self._grep_maps:
            mm = self._open_mmap(
                wordlist_path_from_name(
                    'greppable/%s.%d' % (self.name, cur_length)))
            self._grep_maps[cur_length] = mm
        else:
            mm = self._grep_maps[cur_length]

        # The first entry in the file has no preceding newline, so it
        # needs its own anchored match.
        match = first_entry_re.match(mm)
        if match:
            # Drop the trailing comma.
            found = mm[match.start():match.end() - 1].decode('ascii')
            num_found += 1
            yield self.segment_logprob(found)
            # Enforce the limit here too; otherwise a match at the start
            # of a file could push the total past `count`.
            if num_found >= count:
                return

        for match in later_entry_re.finditer(mm):
            # Drop the leading newline and the trailing comma.
            found = mm[match.start() + 1:match.end() - 1].decode('ascii')
            num_found += 1
            yield self.segment_logprob(found)
            if num_found >= count:
                return
def __len__(self):
    """
    Return the result of `regex_len` applied to this object's expression
    (`self.expr`).
    """
    # NOTE(review): elsewhere in this file the result of `regex_len` is
    # unpacked as a (minlen, maxlen) pair. Python requires `__len__` to
    # return an integer, so `len()` on this object would raise TypeError if
    # a pair comes back here too -- confirm the intended behavior.
    expr_length = regex_len(self.expr)
    return expr_length