-
Notifications
You must be signed in to change notification settings - Fork 0
/
lucene_indexer.py
53 lines (45 loc) · 1.53 KB
/
lucene_indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import lucene
from java.nio.file import Paths
from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo, IndexOptions
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.fa import PersianAnalyzer
from xlrd import open_workbook
# Open the sample workbook and keep handles to its first two worksheets.
# NOTE(review): xlrd 2.0+ only reads legacy .xls files; opening this .xlsx
# presumably requires xlrd < 2.0 -- confirm the pinned version.
wb = open_workbook('./QA-samples.xlsx')
sheet0, sheet1 = (wb.sheet_by_index(idx) for idx in (0, 1))
# Start the embedded JVM before any Java/Lucene object is created.
print('initializing Lucene VM')
lucene.initVM()
print('lucene version ', lucene.VERSION)

# Names of the two fields attached to every indexed document.
question_field = 'question'
answer_field = 'answer'

# Filesystem directory that holds the index.
index_path = Paths.get('./lucene.index')
index_store = SimpleFSDirectory(index_path)

# PersianAnalyzer is the analyzer in use; StandardAnalyzer() is the
# alternative that was tried previously.
analyzer = PersianAnalyzer()

# OpenMode.CREATE makes the writer build a fresh index, replacing any index
# already present at index_path.
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(index_store, config)
# Whether question text is run through the analyzer when indexed.
TokenizeFields = True

# Question field type: indexed with document and term-frequency postings,
# tokenized, and stored so the original text can be read back from a hit.
# (There is no setIndexed() call here; setIndexOptions() is what makes the
# field searchable.)
qft = FieldType()
qft.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
qft.setTokenized(TokenizeFields)
qft.setStored(True)

# Answer field type: stored only. No index options are set, so -- assuming
# FieldType's default of IndexOptions.NONE -- answers are retrievable but
# not searchable.
aft = FieldType()
aft.setStored(True)
# Build one document per spreadsheet row and write it to the index.
#
# The whole write is guarded so the writer is always released: on success it
# is committed and closed; on any failure the pending changes are rolled back
# (which also closes the writer) and the error is re-raised, so a half-built
# index is never committed.
try:
    # Drop any documents already in the index before re-adding everything.
    writer.deleteAll()

    # Row 0 is skipped (presumably a header row -- confirm against the sheet).
    # NOTE(review): the row count comes from sheet1 and questions are read
    # from sheet1, but answers are read from sheet0. If sheet0 has fewer rows
    # than sheet1 the cell lookup raises IndexError. Confirm the two-sheet
    # layout is intentional.
    for row in range(1, sheet1.nrows):
        row_q = str(sheet1.cell(row, 0).value)
        row_a = str(sheet0.cell(row, 1).value)

        doc = Document()
        doc.add(Field(question_field, row_q, qft))
        doc.add(Field(answer_field, row_a, aft))
        writer.addDocument(doc)

    writer.commit()
except BaseException:
    # Discard uncommitted changes and release the index write lock.
    writer.rollback()
    raise
else:
    writer.close()

print('indexing completed')