Example #1
    def createIndexWriter(self, actual_dir, max_field_length=20000000):
        """
            Returns an IndexWriter object created for the actual_dir specified
        """
        ensureDirExists(actual_dir)
        index = SimpleFSDirectory(File(actual_dir))
        analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

        writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
        similarity = FieldAgnosticSimilarity()

        writerConfig.setSimilarity(similarity)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        # Legacy pre-4.0 constructor, kept for reference:
        # res = IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
        res = IndexWriter(index, writerConfig)
        res.deleteAll()
        return res
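
For orientation, a minimal sketch of how this factory might be called, assuming a Lucene 4.x-era PyLucene to match the File-based SimpleFSDirectory above; the IndexBuilder holder class, the index path, and the field contents are illustrative and not part of the original project.

# Hypothetical usage sketch (IndexBuilder is an assumed class containing createIndexWriter).
import lucene
from org.apache.lucene.document import Document, Field, TextField

lucene.initVM()
writer = IndexBuilder().createIndexWriter('/tmp/demo_index')

doc = Document()
doc.add(TextField('body', 'hello lucene', Field.Store.YES))
writer.addDocument(doc)
writer.commit()
writer.close()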
Example #2
    def createIndexWriter(self, actual_dir, max_field_length=20000000):
        """
            Returns an IndexWriter object created for the actual_dir specified
        """
        ensureDirExists(actual_dir)
        index = SimpleFSDirectory(File(actual_dir))
        analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

        writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT,
                                         analyzer)
        similarity = FieldAgnosticSimilarity()

        writerConfig.setSimilarity(similarity)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        # Legacy pre-4.0 constructor, kept for reference:
        # res = IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
        res = IndexWriter(index, writerConfig)
        res.deleteAll()
        return res
Example #3
class LuceneManager(object):
    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' %
                                                        self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)

        return self

    def insert(self, document):
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        self.writer.deleteAll()

    def num_docs(self):
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)),
                          BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        return results.totalHits > 0

    def commit(self):
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()  # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(
                token_stream, contents, n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """
        Run search query.
        """
        # TODO: support date range queries

        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND) # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(
            parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'),
                                  QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method
        return [
            self._process_search_result(result) for result in
            self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """
        Used by the "with" statement. Handles close.
        TODO: error handling
        """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """
        Debug what StandardAnalyzer will give on this text.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            #tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
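
Below is a minimal usage sketch of this manager; the index location, key, and text are illustrative, while the field names ('key', 'fullpath', 'last_modified_time', 'body') are the ones the class itself queries, deletes on, and highlights.

# Hypothetical usage of LuceneManager; assumes the same PyLucene environment as the class above.
from org.apache.lucene.document import Document, Field, StringField, TextField

with LuceneManager('/tmp/sift_demo') as manager:
    doc = Document()
    doc.add(StringField('key', 'doc-1', Field.Store.YES))                 # unique id used by delete/update/exists
    doc.add(StringField('fullpath', '/notes/todo.txt', Field.Store.YES))
    doc.add(StringField('last_modified_time', '2019-01-01', Field.Store.YES))
    doc.add(TextField('body', 'remember to read about lucene indexing', Field.Store.YES))
    manager.insert(doc)
    manager.commit()                       # refreshes the reader/searcher so the new doc is visible
    print(manager.exists('doc-1'))         # True
    print(manager.search('lucene'))        # list of dicts with fullpath, score, excerpt

Using StringField keeps the key un-analyzed, so the exact-match TermQuery behind exists(), delete(), and update() works as intended.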
Example #4

# open an IndexWriter over index_store with a PersianAnalyzer;
# OpenMode.CREATE drops any existing index contents
analyzer = PersianAnalyzer()
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(index_store, config)

TokenizeFields = True

# Question field type: stored, tokenized, indexed with docs and term frequencies
qft = FieldType()
# qft.setIndexed(True)  # setIndexed() is gone in newer Lucene; setIndexOptions() below marks the field as indexed
qft.setStored(True)
qft.setTokenized(TokenizeFields)
qft.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

# Answer field type: stored only (index options default to NONE, so the field is not searchable)
aft = FieldType()
# aft.setIndexed(False)  # likewise unnecessary; leaving index options unset keeps the field unindexed
aft.setStored(True)

# rebuild the index from scratch, one document per spreadsheet row
writer.deleteAll()
for row in range(1, sheet1.nrows):
    doc = Document()
    row_q = str(sheet1.cell(row, 0).value)  # question text, column 0
    row_a = str(sheet0.cell(row, 1).value)  # answer text, column 1
    doc.add(Field(question_field, row_q, qft))
    doc.add(Field(answer_field, row_a, aft))
    writer.addDocument(doc)
writer.commit()
writer.close()
print('indexing completed')
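
To close the loop, a hedged sketch of querying the index written above; index_store, question_field, and answer_field are assumed to be the same objects used during indexing, and the Persian query string is only an example.

# Hypothetical search over the index written above.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

reader = DirectoryReader.open(index_store)
searcher = IndexSearcher(reader)
query = QueryParser(question_field, PersianAnalyzer()).parse('سلام')  # example Persian query
for hit in searcher.search(query, 5).scoreDocs:
    doc = searcher.doc(hit.doc)
    print(doc.get(question_field), '->', doc.get(answer_field), hit.score)
reader.close()

Only the question field is searchable here, because the answer FieldType never sets index options; the stored answer is simply retrieved from the matching document.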