Пример #1
0
def index_files(filenames):
    """Create a search index from the given files and store it in `INDEX_DIR`.

    Each filename must look like ``<name>/<ll>.txt``, where ``<ll>`` is a
    two-character code used as the language. Every sentence yielded by
    ``fetch_sentences(name, lang + 'm')`` is added as its own document whose
    ``path`` is ``<name relative to TEXTS_ROOT>:<lang>:<sentence number>``.

    A file that does not match the pattern, or that raises while being read
    or indexed, is reported on stdout and skipped; the remaining files are
    still processed. Documents added before the failure of a given file are
    kept.

    :param filenames: iterable of paths of the text files to index.
    """
    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function (Python 3).
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
        print("Created folder " + INDEX_DIR)

    schema = Schema(path=ID(stored=True, unique=True),
                    content=TEXT)

    # TODO: disable stopword removal
    ix = whoosh.index.create_in(INDEX_DIR, schema)
    writer = ix.writer()

    # Raw string with the dot escaped and no stray "+": only a literal
    # ".txt" suffix is accepted (the old pattern also matched e.g. "enXtxt"
    # or "en.txttt"). Compiled once instead of on every iteration.
    filename_re = re.compile(r'(.*)/(\w\w)\.txt$')

    for filename in filenames:
        try:
            print("Adding " + filename)
            m = filename_re.match(filename)
            if m is None:
                # Give a readable reason instead of an AttributeError on None.
                raise ValueError("filename does not look like <name>/<ll>.txt")
            name = m.group(1)  # relative to '.'
            name_rooted = os.path.relpath(name, start=TEXTS_ROOT)
            lang = m.group(2)
            # NOTE(review): the 'm' suffix is passed through unchanged; its
            # meaning is defined by fetch_sentences -- confirm there.
            for sent_num, sent in enumerate(fetch_sentences(name, lang + 'm')):
                path_str = ":".join([name_rooted, lang, str(sent_num)])
                writer.add_document(path=path_str,
                                    content=sent)
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print("    Failed to add %s" % (filename,))
            print("    %s" % (e,))

    # Without commit() the added documents are never written to the index
    # and the writer keeps holding the index write lock.
    writer.commit()
Пример #2
0
def retrieve_fragment(search_result, query_string, words=50):
    """Build a highlighted text fragment around a search hit.

    Starts from the words of the hit sentence and alternately appends the
    next following sentence and prepends the next preceding one, moving one
    sentence further away each round, until the fragment holds more than
    `words` words or 14 rounds have been made.

    :param search_result: mapping with the keys 'name', 'lang' and
        'sent_num' identifying the hit sentence.
    :param query_string: query passed on to `highlight`.
    :param words: word count above which the fragment stops growing.
    :return: whatever `highlight` returns for the joined fragment.
    """
    text_path = TEXTS_ROOT + '/' + search_result['name']
    sentences = fetch_sentences(text_path, search_result['lang'])

    centre = search_result['sent_num']
    fragment = sentences[centre].split()

    distance = 1
    while distance < 15 and len(fragment) <= words:
        # Grow to the right first ...
        following = centre + distance
        if following < len(sentences):
            fragment = fragment + sentences[following].split()
        if len(fragment) > words:
            break
        # ... then to the left, if there is still room.
        preceding = centre - distance
        if preceding >= 0:
            fragment = sentences[preceding].split() + fragment
        distance += 1

    return highlight(" ".join(fragment), query_string)
Пример #3
0
        # Indexing branch (the condition selecting it is outside this
        # excerpt): every argument from the third one on is a file to index.
        files = sys.argv[2:]

        # Time the whole indexing run and report the elapsed wall-clock time.
        start_time = datetime.now()
        index_files(files)
        end_time = datetime.now()
        print "Total time: %s" % (end_time - start_time)

    else:
        # Search branch: all command-line arguments together form the query.
        # NOTE(review): `wrap` is not used in the visible part of this branch.
        from textwrap import wrap

        query_string = ' '.join(sys.argv[1:])
        print "Searching for: " + query_string
        print

        for r in search(query_string):
            # Dump the raw hit, then look up the sentence it points to.
            print r
            sentences = fetch_sentences(TEXTS_ROOT + '/' + r['name'],
                                        r['lang'])
            result_text = sentences[r['sent_num']]

            # highlight() is iterated as a sequence of fragments; every
            # odd-indexed fragment is wrapped in the ANSI "bold red" escape
            # (presumably those are the matched terms -- confirm against
            # highlight), even-indexed ones are emitted unchanged.
            s = ""
            for i, fragment in enumerate(highlight(result_text, query_string)):
                if i%2 == 1:
                    s += " \x1b[1;31m%s\x1b[0m " % fragment
                else:
                    s += fragment
            print s
            print