def index_files(filenames):
    """Creates a search index from given files, and store it in the `INDEX_DIR` folder.

    Each file name must look like ``<text dir>/<two-character language>.txt``.
    Every sentence returned by `fetch_sentences` for that text becomes one
    document whose `path` is ``<text dir relative to TEXTS_ROOT>:<lang>:<sentence number>``.

    Files that cannot be processed are reported on stdout and skipped; the
    remaining files are still indexed.

    filenames -- iterable of paths of the text files to index.
    """
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
        print("Created folder " + INDEX_DIR)

    schema = Schema(path=ID(stored=True, unique=True), content=TEXT)  # TODO disable stopword removal
    ix = whoosh.index.create_in(INDEX_DIR, schema)
    writer = ix.writer()

    # Raw string with an escaped dot, so only a literal ".txt" suffix is accepted.
    filename_pattern = re.compile(r'(.*)/(\w\w)\.txt$')

    for filename in filenames:
        # Best effort: a failure on one file must not abort the whole run.
        try:
            print("Adding " + filename)
            m = filename_pattern.match(filename)
            if m is None:
                raise ValueError("file name does not match <dir>/<lang>.txt")
            name = m.group(1)  # relative to '.'
            name_rooted = os.path.relpath(name, start=TEXTS_ROOT)
            lang = m.group(2)
            # NOTE(review): sentences are fetched with lang + 'm' here; confirm
            # this suffix is the intended variant for indexing.
            for sent_num, sent in enumerate(fetch_sentences(name, lang + 'm')):
                path_str = ":".join([name_rooted, lang, str(sent_num)])
                writer.add_document(path=path_str, content=sent)
        except Exception as e:
            print(" Failed to add %s" % (filename,))
            print("  %s" % (e,))

    # Without a commit the added documents are never written to the index.
    writer.commit()
def retrieve_fragment(search_result, query_string, words=50):
    """Builds a highlighted text fragment around the sentence of a search hit.

    The fragment starts as the words of the matching sentence and is grown by
    alternately appending the next following sentence and prepending the next
    preceding one, up to 14 sentences away on each side, stopping as soon as
    the fragment holds more than `words` words.

    search_result -- mapping with the keys 'name', 'lang' and 'sent_num'.
    query_string  -- passed unchanged to `highlight` together with the fragment.
    words         -- word count that ends the expansion once exceeded.

    Returns whatever `highlight` returns for the joined fragment.
    """
    text_path = TEXTS_ROOT + '/' + search_result['name']
    all_sentences = fetch_sentences(text_path, search_result['lang'])
    hit = search_result['sent_num']

    fragment_words = all_sentences[hit].split()
    total = len(all_sentences)

    distance = 1
    while distance < 15 and len(fragment_words) <= words:
        # Grow forwards first ...
        following = hit + distance
        if following < total:
            fragment_words = fragment_words + all_sentences[following].split()
        if len(fragment_words) > words:
            break
        # ... then backwards, keeping the words in reading order.
        preceding = hit - distance
        if preceding >= 0:
            fragment_words = all_sentences[preceding].split() + fragment_words
        distance += 1

    return highlight(" ".join(fragment_words), query_string)
files = sys.argv[2:] start_time = datetime.now() index_files(files) end_time = datetime.now() print "Total time: %s" % (end_time - start_time) else: from textwrap import wrap query_string = ' '.join(sys.argv[1:]) print "Searching for: " + query_string print for r in search(query_string): print r sentences = fetch_sentences(TEXTS_ROOT + '/' + r['name'], r['lang']) result_text = sentences[r['sent_num']] s = "" for i, fragment in enumerate(highlight(result_text, query_string)): if i%2 == 1: s += " \x1b[1;31m%s\x1b[0m " % fragment else: s += fragment print s print