forked from C-960/markdown-search
-
Notifications
You must be signed in to change notification settings - Fork 1
/
search.py
237 lines (198 loc) · 8.6 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import shutil
import html.parser
from markdown_parser import MarkdownParser
import mistune
from whoosh.fields import *
import whoosh.index as index
import whoosh.highlight as hl
import os
import os.path
import codecs
from whoosh.qparser import MultifieldParser, QueryParser
from jieba.analyse import ChineseAnalyzer
class SearchResult:
    """A single hit returned by Search.search().

    Holds the matched document's relevance score, location, raw content,
    an HTML highlight snippet, and its stored keyword fields.
    """

    def __init__(self):
        # Previously these were class-level attributes used as implicit
        # per-instance defaults; real instance attributes avoid any risk of
        # state bleeding between results.
        self.score = 1.0              # whoosh relevance score of the hit
        self.path = None              # path of the matched markdown file
        self.content = ""             # raw stored content of the file
        self.content_highlight = ""   # HTML snippet with <mark> highlights
        self.headlines = None         # stored headline keywords, if any
        self.tags = ""                # stored tag keywords of the document
class DontEscapeHtmlInCodeRenderer(mistune.Renderer):
    """mistune renderer that emits code blocks/spans without escaping the
    HTML they contain (the Markdown instance is built with escape=False)."""

    def __init__(self, **kwargs):
        super(DontEscapeHtmlInCodeRenderer, self).__init__(**kwargs)

    def block_code(self, code, lang):
        """Render a fenced/indented code block, tagging it with a
        language-specific class when one was supplied."""
        if lang:
            return '<pre><code class="lang-%s">%s\n</code></pre>\n' % (lang, code)
        return '<pre><code>%s\n</code></pre>\n' % code

    def codespan(self, text):
        """Render inline code with trailing whitespace trimmed."""
        return '<code>%s</code>' % text.rstrip()
class Search:
    """Full-text search over a tree of markdown files, backed by a Whoosh
    index.  The content field is analyzed with jieba's ChineseAnalyzer so
    both CJK and latin text are searchable."""

    ix = None            # whoosh.index.Index, set by open_index()
    index_folder = None  # directory the index lives in
    # Renders highlighted snippets back to HTML for display.
    markdown = mistune.Markdown(renderer=DontEscapeHtmlInCodeRenderer(), escape=False)
    # Kept only for backward compatibility with any external users; its
    # unescape() method was removed in Python 3.9, so this class now calls
    # html.unescape() instead.
    html_parser = html.parser.HTMLParser()
    schema = None        # whoosh Schema, set by open_index()

    def __init__(self, index_folder):
        self.open_index(index_folder)

    def open_index(self, index_folder, create_new=False):
        """Open the Whoosh index in *index_folder*, creating the folder and
        index if necessary.  With create_new=True any existing index is
        deleted and rebuilt from scratch."""
        self.index_folder = index_folder
        if create_new:
            if os.path.exists(index_folder):
                shutil.rmtree(index_folder)
                print("deleted index folder: " + index_folder)
        if not os.path.exists(index_folder):
            os.mkdir(index_folder)
        exists = index.exists_in(index_folder)
        chinese_analyzer = ChineseAnalyzer()
        # Field boosts rank matches: filename > tags > headlines > bold
        # (double emphasis) > italic (emphasis) > plain content.
        schema = Schema(
            path=ID(stored=True, unique=True)
            , filename=TEXT(stored=True, field_boost=100.0)
            , tags=KEYWORD(stored=True, scorable=True, field_boost=80.0)
            , headlines=KEYWORD(stored=True, scorable=True, field_boost=60.0)
            , doubleemphasiswords=KEYWORD(stored=True, scorable=True, field_boost=40.0)
            , emphasiswords=KEYWORD(stored=True, scorable=True, field_boost=20.0)
            , content=TEXT(stored=True, analyzer=chinese_analyzer)
            , time=STORED
        )
        # Remember the schema: previously it stayed as the class-level None,
        # which broke QueryParser("content", self.schema) in search().
        self.schema = schema
        if not exists:
            self.ix = index.create_in(index_folder, schema)
        else:
            self.ix = index.open_dir(index_folder)

    def add_document(self, writer, file_path, config):
        """Parse the markdown file at *file_path* and add it to the index
        through *writer*.  *config* is forwarded to MarkdownParser.parse()."""
        # Split the path on separators/punctuation so individual path
        # components match in the heavily boosted "filename" field.
        file_name = str(file_path.replace(".", " ").replace("/", " ").replace("\\", " ").replace("_", " ").replace("-", " "))
        # read file content
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        path = str(file_path)
        # parse markdown fields (tags, headlines, emphasised words)
        parser = MarkdownParser()
        parser.parse(content, config)
        modtime = os.path.getmtime(path)
        print("adding to index: path: %s size:%d tags:'%s' headlines:'%s' modtime=%d" % (
            path, len(content), parser.tags, parser.headlines, modtime))
        writer.add_document(
            path=path
            , filename=file_name
            , headlines=parser.headlines
            , tags=parser.tags
            , content=content
            , doubleemphasiswords=parser.doubleemphasiswords
            , emphasiswords=parser.emphasiswords
            , time = modtime
        )

    def add_all_files(self, file_dir, config, create_new_index=False):
        """Index every markdown file under *file_dir*, following symlinks."""
        if create_new_index:
            self.open_index(self.index_folder, create_new=True)
        count = 0
        writer = self.ix.writer()
        for root, dirs, files in os.walk(file_dir, followlinks=True):
            for file in files:
                # NOTE(review): "markdown" (no dot) also matches names like
                # "xmarkdown"; preserved as-is to avoid changing behavior.
                if file.endswith(".md") or file.endswith("markdown"):
                    path = os.path.join(root, file)
                    self.add_document(writer, path, config)
                    count += 1
        writer.commit()
        print("Done, added %d documents to the index" % count)

    def update_index_incremental(self, config, create_new_index=False):
        """Synchronise the index with the files on disk: drop entries whose
        file was deleted, re-index files changed since indexing (by mtime),
        and add files never seen before."""
        file_dir = config["MARKDOWN_FILES_DIR"]
        if create_new_index:
            self.open_index(self.index_folder, create_new=True)
        all_files = []
        for root, dirs, files in os.walk(file_dir, followlinks=True):
            for file in files:
                if file.endswith(".md") or file.endswith("markdown"):
                    path = os.path.join(root, file)
                    all_files.append(path)
        # see: https://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()
        count = 0
        with self.ix.searcher() as searcher:
            writer = self.ix.writer()
            # Loop over the stored fields in the index
            for fields in searcher.all_stored_fields():
                indexed_path = fields['path']
                indexed_paths.add(indexed_path)
                if not os.path.exists(indexed_path):
                    # This file was deleted since it was indexed
                    writer.delete_by_term('path', indexed_path)
                    print("removed from index: %s" % indexed_path)
                else:
                    # Check if this file was changed since it was indexed
                    indexed_time = fields['time']
                    mtime = os.path.getmtime(indexed_path)
                    if mtime > indexed_time:
                        # The file has changed: delete the stale entry and
                        # queue the path for re-indexing
                        writer.delete_by_term('path', indexed_path)
                        to_index.add(indexed_path)
            # Loop over the files in the filesystem
            for path in all_files:
                if path in to_index or path not in indexed_paths:
                    # Either a changed file or a brand-new one: (re)index it
                    self.add_document(writer, path, config)
                    count += 1
            writer.commit()
        print("Done, updated %d documents in the index" % count)

    def create_search_result(self, results):
        """Convert whoosh hits into SearchResult objects, rendering a
        <mark>-highlighted HTML snippet for each."""
        # Allow larger fragments
        results.fragmenter.maxchars = 300
        # Show more context before and after
        results.fragmenter.surround = 50
        # Wrap matched terms in <mark> tags
        results.formatter = hl.HtmlFormatter(tagname="mark")
        search_results = []
        for r in results:
            sr = SearchResult()
            sr.score = r.score
            sr.tags = r["tags"]
            sr.path = r["path"]
            sr.content = r["content"]
            highlights = r.highlights("content")
            if not highlights:
                # Nothing highlightable: fall back to a capped preview
                highlights = self.cap(r["content"], 500)
            # Undo HTML escaping before markdown rendering.  html.unescape()
            # replaces HTMLParser.unescape(), which was removed in Python
            # 3.9.  The local that held the rendered output was also renamed
            # from "html" so it no longer shadows the html module.
            highlights = html.unescape(highlights)
            rendered = self.markdown(highlights)
            sr.content_highlight = rendered
            if "headlines" in r:
                sr.headlines = r["headlines"]
            search_results.append(sr)
        return search_results

    def cap(self, s, l):
        """Truncate *s* to at most *l* characters, ending with '...'."""
        return s if len(s) <= l else s[0:l - 3] + '...'

    def get_tags(self):
        """Return all terms stored in the "tags" field of the index."""
        with self.ix.searcher() as searcher:
            return list(searcher.lexicon("tags"))

    def search(self, query_list, fields=None):
        """Search the index for the words in *query_list*.

        *fields* may restrict the search to ["filename"], ["tags"], or any
        two fields; otherwise every field is searched.  Returns a tuple
        (parsed_query_string, [SearchResult], tag_cloud_keywords).
        """
        with self.ix.searcher() as searcher:
            query_string = " ".join(query_list)
            if "\"" in query_string or ":" in query_string:
                # Quoted phrase or explicit field prefix: hand the raw
                # whoosh query syntax to a single-field parser.  (Previously
                # parsed against self.schema, which was always None and
                # crashed here.)
                query = QueryParser("content", self.ix.schema).parse(query_string)
            else:
                # Honour an explicit one-field ("filename"/"tags") or
                # two-field restriction; otherwise search everything.
                # (Also guards fields=None, which previously raised
                # TypeError on len(fields).)
                restricted = fields is not None and (
                    len(fields) == 2
                    or (len(fields) == 1 and fields[0] in ("filename", "tags")))
                if not restricted:
                    fields = ["tags", "headlines", "content", "filename", "doubleemphasiswords", "emphasiswords"]
                query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
            parsed_query = "%s" % query
            print("query: %s" % parsed_query)
            results = searcher.search(query, terms=False, scored=True, groupedby="path")
            key_terms = results.key_terms("tags", docs=100, numterms=100)
            tag_cloud = [keyword for keyword, score in key_terms]
            search_result = self.create_search_result(results)
            return parsed_query, search_result, tag_cloud

    def get_document_total_count(self):
        """Return the total number of documents in the index.  The searcher
        is closed via the context manager (the old version leaked it)."""
        with self.ix.searcher() as searcher:
            return searcher.doc_count_all()
if __name__ == "__main__":
    # Build an index over the given wiki directory.  add_all_files() requires
    # a config mapping (forwarded to MarkdownParser.parse()); the original
    # call omitted it and raised TypeError.  An empty dict is passed here —
    # NOTE(review): confirm which keys MarkdownParser actually expects.
    search = Search("search_index")
    search.add_all_files("/Volumes/data/doc/wiki/dev", {})