def search(self, input):
    """Returns tuple(index:str, resp:list, priority:float)

    Note that this method doesn't conform the ABC, you should only use
    it for debugging purpose or you are ONLY using this engine"""
    # Group DB responses by their (unordered) keyword set.
    grouped = {}
    for index, resp in self._search_db(input):
        key = frozenset(index.split())
        grouped.setdefault(key, []).extend(resp.split('\f'))
    diff = SequenceMatcher(
        partial(contains, '?,./<>`~!@#$%&*()_+-={}[];:\'"|\\'),
        input + ' '.join(self.state))
    cleaned = strip_clean(input.lower())
    cleaned_words = cleaned.split()
    # Candidate words: carried-over state plus the cleaned input words.
    words = self.state.union(cleaned_words)

    def matches(entry):
        # Every keyword must be a prefix of at least one known word.
        return all(
            any(imap(methodcaller('startswith', key), words))
            for key in entry[0])

    def getdiff(text):
        diff.set_seq2(text)
        return diff.ratio()

    results = []
    for index, resp in filter(matches, grouped.iteritems()):
        # Order the keywords by where they occur in the cleaned input
        # before scoring the similarity.
        priority = getdiff(' '.join(sorted(index, key=cleaned.find)))
        results.append((index, resp, priority))
    results.sort(key=itemgetter(2), reverse=True)
    self.state = keywords(input)
    return results
def acronyms(self):
    """Strip known acronyms from the text, then (unless the text looks
    like all-caps spam) drop remaining all-caps words, and rebuild the
    cached lowercase text and word list from the result."""
    for word in self.words[:]:
        if word in acronyms:
            # Remove every occurrence of the acronym, case-insensitively.
            self.text = re.sub(re.escape(word), '', self.text, flags=re.I)
    caps_count = len([None for i in self.text if i in ascii_uppercase])
    letter_count = len([None for i in self.text if i in ascii_letters])
    # BUG FIX: this file is Python 2, so the original
    # `caps_count / letter_count < .5` used integer (floor) division:
    # the quotient was always 0 (or >= 1 for all-caps text), making the
    # "< .5" threshold degenerate to `caps_count < letter_count`.
    # Cross-multiplying gives the intended "less than 50% capitals"
    # test exactly, without floating point.
    if letter_count != 0 and caps_count * 2 < letter_count:
        # Doesn't look like all caps spam.
        self.text = recapword.sub('', self.text)
    self.lower = self.text.lower()
    self.words = strip_clean(self.lower, proper_letters).split()
def search(self, input):
    """Run every regex from the database against *input*.

    Returns a list of (match, responses, priority) tuples with the
    response templates already expanded, sorted best-first by priority.
    """
    input = rewhite.sub(' ', strip_clean(input))
    differ = SequenceMatcher(
        partial(contains, '?,./<>`~!@#$%&*()_+-={}[];:\'"|\\'), input)

    def ratio_against(text):
        differ.set_seq2(text)
        return differ.ratio()

    # Resolve the stored regex keys and split the packed responses.
    candidates = [(self.regex[regex], resp.split('\f'))
                  for regex, resp in self._search_db(input)]

    results = []
    for regex, resp in candidates:
        match = regex.search(input)
        if match is None:
            continue
        if match.lastindex is not None:
            # Strip all backreference groups off
            base = StringIO()
            pos = match.start()
            for i in xrange(1, match.lastindex + 1):
                base.write(input[pos:match.start(i)])
                pos = match.end(i)
            base.write(input[pos:match.end()])
            priority = ratio_against(base.getvalue())
        else:
            priority = ratio_against(match.group(0))
        whole = match.group(0)

        def expand(template):
            # Substitute \0 / \g<0> ourselves (re.expand only handles
            # \1, \g<1>, \g<name>), then collapse repeated whitespace.
            expanded = match.expand(
                template.replace(r'\0', whole).replace('\g<0>', whole))
            return respaces.sub(' ', expanded)

        results.append((match, map(expand, resp), priority))
    results.sort(key=itemgetter(2), reverse=True)
    return results
def tokens(text):
    """Returns text as a list of words(tokens) in lowercase"""
    lowered = text.lower()
    return strip_clean(lowered, alpha).split()
def __init__(self, text):
    """Cache the raw text together with its lowercase and tokenised
    forms, and start with a clean error/reason state."""
    lowered = text.lower()
    self.text = text
    self.lower = lowered
    self.words = strip_clean(lowered, proper_letters).split()
    # No problems detected yet.
    self.error = 0
    self.reasons = set()