def fetch_sentences(basename, lang):
    """Return the list of sentences for text *basename* in language *lang*.

    *lang* is a two-letter language code ('pl', 'cu', 'el') optionally
    followed by a one-letter transformation suffix:
      'm' -- metaphone-encode each sentence,
      't' -- Polish transliteration,
      'e' -- Cyrillic expansion (with numbers).

    Prefers a pre-split "<basename>/<xx>.sentences" file (UTF-8, one
    sentence per line); falls back to sentence-splitting
    "<basename>/<xx>.txt" via Text when the .sentences file is missing.
    """
    assert lang in ('pl', 'plm', 'cu', 'cum', 'cut', 'cue',
                    'el', 'elm', 'elt'), "invalid lang " + lang
    real_lang = lang[:2]
    transformation = lang[2:]
    basename_with_lang = "%s/%s" % (basename, real_lang)
    try:
        # TODO maybe open ready metaphone files?
        # `open` instead of the py2-only `file` builtin; iterate the
        # handle directly rather than materializing readlines().
        with open("%s.sentences" % basename_with_lang) as f:
            t = [line.decode('utf-8').strip() for line in f]
    except IOError:
        t = Text.from_file("%s.txt" % basename_with_lang,
                           lang=real_lang).as_sentences_flat()
    # An empty suffix matches none of the branches and falls through to t.
    if transformation == 'm':
        return [metaphone_text(s, lang=real_lang) for s in t]
    elif transformation == 't':
        return [translit_pl(s, real_lang) for s in t]
    elif transformation == 'e':
        return [expand_cu(s, numbers=True) for s in t]
    return t
def highlight(result, query_string):
    """Split *result* into alternating non-match / match segments.

    A word matches when its metaphone encoding equals the metaphone
    encoding of any word of *query_string*.  Returns a list of odd
    length: [plain, hit, plain, hit, ..., plain], where plain segments
    may be empty strings.
    """
    result_words = result.split()
    result_ms = metaphone_text(result).split()
    query_ms = metaphone_text(query_string).split()
    # NOTE(review): assumes metaphone_text preserves the word count so the
    # two lists line up -- confirm for punctuation-heavy input.
    r = []
    last = []
    # The original zipped an explicit xrange index that was never used;
    # zip the word lists directly instead.
    for word, word_m in zip(result_words, result_ms):
        if word_m in query_ms:
            r.append(' '.join(last))
            r.append(word)
            last = []
        else:
            last.append(word)
    r.append(' '.join(last))
    assert len(r) % 2 == 1
    return r
def preprocess(sent, use_metaphone=True):
    """Normalize a sentence for indexing/search.

    Strips stray typographic characters, pads sentence punctuation with
    spaces and collapses runs of whitespace; then either metaphone-encodes
    the text (default) or merely lowercases it.
    """
    sent = re.sub('[¶♦\'=`^]', '', sent)
    # Raw strings so regex escapes like \s and \1 are not treated as
    # (deprecated) string-literal escapes.
    sent = re.sub(r'([.,:;!?])', r' \1 ', sent)
    sent = re.sub(r'\s+', ' ', sent)
    if use_metaphone:
        sent = metaphone_text(sent, remove_vowels=False, max_length=20)
        sent = re.sub(r'\s?[-?]\s?', ' ', sent)
        sent = sent.strip()
    else:
        sent = sent.lower()
    return sent
def search(query_string, page_num=1, page_length=10):
    """Run a metaphone-encoded full-text query against the Whoosh index.

    Yields one dict per hit on the requested page:
    {'name': ..., 'lang': ..., 'sent_num': int}.  The raw result page is
    also stashed in the module-level _last_results for later reuse.
    """
    global _last_results
    encoded = metaphone_text(query_string)
    index = whoosh.index.open_dir(INDEX_DIR)
    parsed = QueryParser("content", index.schema).parse(encoded)
    with index.searcher() as searcher:
        _last_results = searcher.search_page(parsed, page_num,
                                             pagelen=page_length)
        for hit in _last_results:
            name, lang, sent_num = hit['path'].split(':')
            yield {'name': name,
                   'lang': lang,
                   'sent_num': int(sent_num)}