示例#1
0
def fetch_sentences(basename, lang):
    """Return the sentences of one text as a flat list of strings.

    ``lang`` is a two-letter language code optionally followed by a
    one-letter transformation suffix.  Accepted values are 'pl', 'plm',
    'cu', 'cum', 'cut', 'cue', 'el', 'elm' and 'elt'.  The suffix selects
    what is applied to every sentence before returning:

    * ``m`` -- ``metaphone_text(s, lang=<two-letter code>)``
    * ``t`` -- ``translit_pl(s, <two-letter code>)``
    * ``e`` -- ``expand_cu(s, numbers=True)``
    * no suffix -- sentences are returned unchanged.

    Sentences are read from ``<basename>/<code>.sentences`` (UTF-8, one
    sentence per line, surrounding whitespace stripped).  If that file
    cannot be read (IOError), they are obtained instead from
    ``Text.from_file("<basename>/<code>.txt", lang=<code>)`` via
    ``as_sentences_flat()``.

    Raises:
        AssertionError: if ``lang`` is not one of the accepted values.
    """
    # Local import so the block does not depend on the file's import header.
    import io

    valid_langs = ('pl', 'plm',
                   'cu', 'cum', 'cut', 'cue',
                   'el', 'elm', 'elt')
    if lang not in valid_langs:
        # Raised explicitly (instead of using ``assert``) so the check is
        # not stripped under ``python -O``; the exception type and message
        # are kept for backward compatibility.
        raise AssertionError("invalid lang " + lang)

    real_lang = lang[:2]
    transformation = lang[2:]

    basename_with_lang = "%s/%s" % (basename, real_lang)

    try:
        # TODO maybe open ready metaphone files?
        # io.open decodes UTF-8 on both Python 2 and 3; newline='\n' keeps
        # line splitting on '\n' only, as the byte-oriented read did.
        with io.open("%s.sentences" % basename_with_lang,
                     encoding='utf-8', newline='\n') as f:
            t = [line.strip() for line in f]
    except IOError:
        t = Text.from_file("%s.txt" % basename_with_lang,
                           lang=real_lang).as_sentences_flat()

    if transformation == 'm':
        return [metaphone_text(s, lang=real_lang) for s in t]
    if transformation == 't':
        return [translit_pl(s, real_lang) for s in t]
    if transformation == 'e':
        return [expand_cu(s, numbers=True) for s in t]
    return t
示例#2
0
文件: search.py 项目: mik01aj/corthus
def highlight(result, query_string):
    """Split ``result`` into alternating non-matching / matching fragments.

    Both ``result`` and ``query_string`` are passed through
    ``metaphone_text`` and split on whitespace.  A word of ``result`` counts
    as a match when its metaphone token equals any token of the query.

    Returns:
        A list of odd length.  Even indices hold the space-joined run of
        non-matching words preceding a match (possibly the empty string);
        odd indices hold a single matching word.  The last element is the
        trailing run of non-matching words.
    """
    result_words = result.split()
    result_ms = metaphone_text(result).split()
    # A set gives constant-time membership tests inside the loop.
    query_ms = set(metaphone_text(query_string).split())

    r = []
    pending = []
    # NOTE(review): zip() stops at the shorter sequence, so this relies on
    # metaphone_text producing one token per input word -- confirm.
    for word, word_m in zip(result_words, result_ms):
        if word_m in query_ms:
            r.append(' '.join(pending))
            r.append(word)
            pending = []
        else:
            pending.append(word)
    r.append(' '.join(pending))

    # Every match contributes two items and the tail contributes one.
    assert len(r) % 2 == 1
    return r
示例#3
0
def preprocess(sent, use_metaphone=True):
    """Normalize a sentence before further processing.

    Steps applied in order:

    1. Remove the marker characters listed in the first pattern
       (pilcrow, diamond, apostrophe, ``=``, backtick, ``^``).
    2. Surround each of ``. , : ; ! ?`` with spaces.
    3. Collapse every run of whitespace into a single space.
    4. If ``use_metaphone`` is true: pass the text through
       ``metaphone_text(sent, remove_vowels=False, max_length=20)``,
       replace every ``-`` or ``?`` (together with at most one adjacent
       whitespace character on each side) by a single space, and strip
       the result.
       Otherwise: lower-case the text.  Note that this branch does not
       strip, so a leading or trailing space may remain.

    Returns:
        The normalized sentence.
    """
    # Raw strings keep the regex escapes (\s, \1) out of Python's own
    # string-escape processing; the patterns themselves are unchanged.
    sent = re.sub(r"[¶♦'=`^]", '', sent)
    sent = re.sub(r'([.,:;!?])', r' \1 ', sent)
    sent = re.sub(r'\s+', ' ', sent)
    if use_metaphone:
        sent = metaphone_text(sent, remove_vowels=False, max_length=20)
        sent = re.sub(r'\s?[-?]\s?', ' ', sent)
        sent = sent.strip()
    else:
        sent = sent.lower()
    return sent
示例#4
0
文件: search.py 项目: mik01aj/corthus
def search(query_string, page_num=1, page_length=10):
    """Yield one page of index hits for ``query_string``.

    The query is first passed through ``metaphone_text`` and then parsed
    against the "content" field of the whoosh index opened from
    ``INDEX_DIR``.  The page object returned by ``search_page`` is also
    stored in the module-level ``_last_results``.

    This is a generator: the searcher is opened on first iteration and
    stays open until the generator is exhausted or closed.

    Yields:
        Dicts with the keys 'name', 'lang' and 'sent_num' (an int), taken
        from the three ':'-separated parts of each hit's 'path' field.
    """
    global _last_results

    encoded_query = metaphone_text(query_string)
    index = whoosh.index.open_dir(INDEX_DIR)
    parser = QueryParser("content", index.schema)
    parsed = parser.parse(encoded_query)

    with index.searcher() as searcher:
        _last_results = searcher.search_page(parsed, page_num,
                                            pagelen=page_length)
        for hit in _last_results:
            # Unpacking fails if the path does not have exactly three parts.
            name, lang, sent_num = hit['path'].split(':')
            yield dict(name=name, lang=lang, sent_num=int(sent_num))