Example #1
from java.io import File
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.store import MMapDirectory
from org.apache.lucene.facet.taxonomy import FacetLabel
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyReader, DirectoryTaxonomyWriter
from org.apache.lucene.facet.taxonomy.writercache import LruTaxonomyWriterCache
# Observable is a base class from the surrounding project, not part of PyLucene.

class TermNumerator(Observable):

    def __init__(self, path, lruTaxonomyWriterCacheSize=100):
        Observable.__init__(self)
        taxoDirectory = MMapDirectory(File(path))
        taxoDirectory.setUseUnmap(False)
        self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))

    def numerateTerm(self, term):
        if not term:
            return
        # addCategory returns the ordinal of the (possibly newly added) category
        return self._taxoWriter.addCategory(FacetLabel([term]))

    def getTerm(self, nr):
        if not hasattr(self, "_taxoReader"):
            # a near-real-time reader opened directly on the writer
            self._taxoReader = DirectoryTaxonomyReader(self._taxoWriter)
        # openIfChanged returns None when the taxonomy is unchanged
        tr = DirectoryTaxonomyReader.openIfChanged(self._taxoReader)
        if tr:
            self._taxoReader.close()
            self._taxoReader = tr
        return self._taxoReader.getPath(nr).components[0]

    def handleShutdown(self):
        print('handle shutdown: saving TermNumerator')
        from sys import stdout
        stdout.flush()
        self._taxoWriter.commit()

    def close(self):
        self._taxoWriter.close()
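
A minimal usage sketch (not from the original project; it assumes lucene.initVM() has been called and that the Observable base class is importable):

numerator = TermNumerator('/tmp/termnumerator')
nr = numerator.numerateTerm('apple')    # taxonomy ordinal assigned to 'apple'
numerator.handleShutdown()              # commit, so ordinals survive a restart
assert numerator.getTerm(nr) == 'apple'
numerator.close()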
Example #2
    def index(cls, indexDir, taxoDir):
        """Create an index and add sample documents and facets to it.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        from org.apache.lucene.util import Version
        config = IndexWriterConfig(Version.LUCENE_42,
                                   WhitespaceAnalyzer(Version.LUCENE_42))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields is a utility class for adding facet fields to a document:
        facet_fields = FacetFields(taxo)

        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [CategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so we need to convert the
            #       Python list in order to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List:
            facetList = Arrays.asList(facetList)

            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # use the FacetFields utility class for adding facet fields (i.e. the categories)
            # to the document (and, as required, to the taxonomy index)
            facet_fields.addFields(doc, facetList)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded += 1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
Example #3
    def index(cls, indexDir, taxoDir, facets_config):
        """Create an index and add sample documents and facets to it.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        facets_config FacetsConfig describing how facet fields are indexed.
        """
        # create and open an index writer
        config = IndexWriterConfig(Version.LUCENE_48,
                                   WhitespaceAnalyzer(Version.LUCENE_48))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir,
                                       IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        for docNum in range(len(docTexts)):
            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # obtain the sample facets for current document
            facets = categories[docNum]
            author = authors[docNum]
            # ... and use the FacetField class for adding facet fields to
            # the Lucene document (and via FacetsConfig to the taxonomy index)
            doc.add(FacetField("Author", author))
            for f in facets:
                doc.add(FacetField("Categories", f))
            # finally add the document to the index
            iw.addDocument(facets_config.build(taxo, doc))
            nDocsAdded += 1

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        iw.close()
        taxo.close()
        print "Indexed %d documents with facets." % nDocsAdded
Example #4
    def index(cls, indexDir, taxoDir, facets_config):
        """Create an index and add sample documents and facets to it.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        facets_config FacetsConfig describing how facet fields are indexed.
        """
        # create and open an index writer
        config = IndexWriterConfig(Version.LUCENE_48,
                                   WhitespaceAnalyzer(Version.LUCENE_48))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        for docNum in range(len(docTexts)):
            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # obtain the sample facets for current document
            facets = categories[docNum]
            author = authors[docNum]
            # ... and use the FacetField class for adding facet fields to
            # the Lucene document (and via FacetsConfig to the taxonomy index)
            doc.add(FacetField("Author", [author]));
            for f in facets:
                doc.add(FacetField("Categories", f))
            # finally add the document to the index
            iw.addDocument(facets_config.build(taxo, doc))
            nDocsAdded += 1

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        iw.close()
        taxo.close()
        print "Indexed %d documents with facets." % nDocsAdded
Example #5
from os.path import join

from java.io import File, StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute, OffsetAttribute
from org.apache.lucene.facet import Facets, FacetsCollector
from org.apache.lucene.facet.taxonomy import CachedOrdinalsReader, DocValuesOrdinalsReader, TaxonomyFacetCounts, TaxonomyReader
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyWriter
from org.apache.lucene.facet.taxonomy.writercache import LruTaxonomyWriterCache
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, MultiFields, Term, TieredMergePolicy
from org.apache.lucene.search.spell import DirectSpellChecker
from org.apache.lucene.store import MMapDirectory
from org.apache.lucene.util import BytesRef, BytesRefIterator, Version
# IndexAndTaxonomy and FacetSuperCollector come from the surrounding project (meresco-lucene).

class Index(object):
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

        self._facetsConfig = settings.fieldRegistry.facetsConfig

        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())

    def addDocument(self, term, document):
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)

    def suggest(self, query, count, field):
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords])
        return suggestions

    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        # JCC cast to BytesRefIterator; its next() raises StopIteration when exhausted
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms

    def fieldnames(self):
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def drilldownFieldnames(self, path=None, limit=50):
        taxoReader = self._indexAndTaxonomy.taxoReader
        parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else taxoReader.getOrdinal(path[0], path[1:])
        childrenIter = taxoReader.getChildren(parentOrdinal)
        names = []
        while True:
            ordinal = childrenIter.next()
            if ordinal == TaxonomyReader.INVALID_ORDINAL:
                break
            names.append(taxoReader.getPath(ordinal).components[-1])
            if len(names) >= limit:
                break
        return names

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()

    def commit(self):
        if not self._settings.readonly:
            self._taxoWriter.commit()
            self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        if not self._multithreaded:
            return FacetsCollector()
        return FacetSuperCollector(self._indexAndTaxonomy.taxoReader, self._facetsConfig, self._ordinalsReader)

    def facetResult(self, facetCollector):
        facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        self._indexAndTaxonomy.close()
        if not self._settings.readonly:
            self._taxoWriter.close()
            self._indexWriter.close()

    def _analyzeToken(self, token):
        result = []
        reader = StringReader(unicode(token))
        stda = self._settings.analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
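
A hypothetical end-to-end sketch (the settings object, with its analyzer, similarity, and fieldRegistry, is supplied by the surrounding meresco-lucene project and is assumed here, as is a 'Categories' facet dimension):

from org.apache.lucene.search import MatchAllDocsQuery

index = Index('/tmp/lucene-index', settings)
index.addDocument(Term('identifier', 'doc:1'), document)  # 'document' is a prepared Lucene Document
index.commit()                        # taxonomy first, then index, then reopen readers
facetCollector = index.createFacetCollector()
index.search(MatchAllDocsQuery(), None, facetCollector)
print(index.facetResult(facetCollector).getTopChildren(10, 'Categories'))
index.close()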