Example #1
    def updateindex(self, data):
        writer = IndexWriter(
            self.d, self.conf)

        doc = self.buildDocument(data['fields'], data['record'])
        writer.updateDocument(lucene.Term("_id", data['record']['_id']), doc)

        writer.optimize()
        writer.close()
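The snippet above depends on its surrounding class (self.d, self.conf, buildDocument), and writer.optimize() only exists in older Lucene releases. For reference, here is a minimal, self-contained sketch of the same delete-then-add update pattern against a recent PyLucene; the index path and field values are placeholders, not taken from the original project:

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, Term
from org.apache.lucene.store import FSDirectory

lucene.initVM()
directory = FSDirectory.open(Paths.get("/tmp/example-index"))  # placeholder path
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))

doc = Document()
doc.add(StringField("_id", "42", Field.Store.YES))
# updateDocument atomically deletes every document matching the term and adds the
# new one, so re-running this block never produces duplicates for the same "_id".
writer.updateDocument(Term("_id", "42"), doc)
writer.commit()
writer.close()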
Example #2
class IndexBuilder(object):
    def __init__(self, index_path, update=False):
        dir = FSDirectory.open(Paths.get(index_path))
        analyzer = StandardAnalyzer()
        iwc = IndexWriterConfig(analyzer)
        if update:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        else:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(dir, iwc)

    def index_docs(self, input_documents):
        for document in tqdm(input_documents, total=len(input_documents)):
            doc = Document()
            doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
            doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))
            type = FieldType()
            type.setIndexOptions(
                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
            type.setStored(True)
            type.setStoreTermVectors(True)
            type.setTokenized(True)
            if ".W" in document and ".M" in document:
                doc.add(
                    Field(
                        "text", " ".join(
                            tokenizer.tokenize(document[".M"].lower() + " " +
                                               document[".T"].lower() +
                                               document[".W"].lower())), type))
            elif ".M" in document and ".W" not in document:
                doc.add(
                    Field(
                        "text", " ".join(
                            tokenizer.tokenize(document[".M"].lower() + " " +
                                               document[".T"].lower())), type))
            elif ".M" not in document and ".W" in document:
                doc.add(
                    Field(
                        "text", " ".join(
                            tokenizer.tokenize(document[".T"].lower() +
                                               document[".W"].lower())), type))
            elif ".M" not in document and ".W" not in document:
                doc.add(
                    Field("text",
                          " ".join(tokenizer.tokenize(document[".T"].lower())),
                          type))
            if self.writer.getConfig().getOpenMode(
            ) == IndexWriterConfig.OpenMode.CREATE:
                self.writer.addDocument(doc)
            else:
                self.writer.updateDocument(Term(".U", document[".U"]), doc)
        self.writer.close()
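IndexBuilder also relies on a module-level tokenizer and on tqdm, which the excerpt does not show. A hypothetical driver, assuming those are available and using placeholder paths and documents shaped like the ".I"/".U"/".T"/".W" keys read above:

# Hypothetical usage; the index path and document dicts are placeholders.
docs = [
    {".I": "1", ".U": "87049087", ".T": "Example title", ".W": "Example abstract text."},
    {".I": "2", ".U": "87049088", ".T": "Title without an abstract"},
]
builder = IndexBuilder("/tmp/example-index", update=True)  # CREATE_OR_APPEND mode
builder.index_docs(docs)  # closes the writer when it finishes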
Example #3
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()
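The expected shape of each entry in documents can be read off the field accesses above. A hedged example call; the field values, the tags format, and the date format (whatever the project's __getTimestamp helper expects) are illustrative only:

# Illustrative input for IndexDocs; keys mirror the dictionary accesses above.
documents = [{
    'name': 'Example article',
    'content': 'Full body text of the article ...',
    'date': '2019-01-01',
    'url': 'http://example.com/article',
    'tags': ['lucene', 'python'],   # passed through self.__qualifyTags()
}]
indexer.IndexDocs(documents)  # `indexer` is an instance of the surrounding Indexer class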
Example #4
File: views.py Project: kevkid/YIF
def survey(request):
    ipAddr = get_client_ip(request)
    instances = (Classes.objects.values_list('image_class_desc'))
    instances = [i[0] for i in instances]
    #cnt = len(instances)
    #lets get out choice
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    
        
    try:
        #image_class = image.objects.get(pk=request.POST['survey'])
        s = request.POST['survey']#get from post
        
                
    except (KeyError, Classes.DoesNotExist):
        return render(request, 'web/index.html',{
            'error_message': "You didn't select a choice.",
        })
    else:
        image_class = instances[int(s)]
        docNum = request.POST['imageID']#get document id
        doc = reader.document(int(docNum))
        fname = doc.get("filename")
        print(fname)
        #SimpleFSDirectory(File(location)).clearLock(IndexWriter.WRITE_LOCK_NAME);
        fileClassField = doc.get("Classification")
        if str(fileClassField) == "None":#check if the field exists####NEED TO CHECK THIS
            fileClassField = str(ipAddr + ":" + image_class)#I think we must add an ip address to this
        else:
            fileClassField = str(ipAddr + ":" + fileClassField) + ", " + image_class
            
        #doc.removeField("Classification")
        
        #doc.add(StringField("Classification", fileClassField, Field.Store.YES))
        #t = doc.get("Classification")
        #reader.close()
        indexDir = SimpleFSDirectory(File(location))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)
        fields = doc.getFields()#get all fields
        doc2 = Document()
        classificationFieldFlag = False
        for f in fields:
            field = Field.cast_(f)
            (k, v) = field.name(), field.stringValue()
            if k == "Classification":
                classificationFieldFlag = True
                field = StringField("Classification", fileClassField, Field.Store.YES)
                doc2.add(field)
            else:
                doc2.add(field)

        if classificationFieldFlag == False:#this does not exist in the document must add
            doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
#         doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
#         doc2.add(StringField("fid", doc.get("fid"), Field.Store.YES))
#         doc2.add(StringField("articleid", doc.get("articleid"), Field.Store.YES))
#         doc2.add(StringField("caption", doc.get("caption"), Field.Store.YES))
#         doc2.add(StringField("figureid", doc.get("figureid"), Field.Store.YES))
#         doc2.add(StringField("filename", doc.get("filename"), Field.Store.YES))
#         doc2.add(StringField("filepath", doc.get("filepath"), Field.Store.YES))
#         doc2.add(StringField("label", doc.get("label"), Field.Store.YES))
        
        #writer.updateDocument(Term("fid","f000000000023"), doc2)#If field exists update
        writer.updateDocument(Term("fid", doc.get("fid")), doc2)#If field exists update
        writer.commit()
        #writer.optimize()
        writer.close()
        #writer.unlock(SimpleFSDirectory(File(location)))
        
    return HttpResponseRedirect(reverse('web:index', args=()))
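Stripped of the Django plumbing, the view performs a read-modify-update: load a stored document, copy its fields into a fresh Document, overwrite (or add) the Classification field, and replace the stored copy keyed on its unique fid term. A condensed sketch of just that core, using the same Lucene 4.x API as the view; location, doc_num, and new_value are placeholders:

# Condensed read-modify-update pattern from the view above (Lucene 4.x API).
location = "/path/to/index"        # placeholder index directory
doc_num = 0                        # placeholder internal Lucene doc id
new_value = "127.0.0.1:figure"     # placeholder classification value

reader = IndexReader.open(SimpleFSDirectory(File(location)))
doc = reader.document(doc_num)

doc2 = Document()
for f in doc.getFields():
    field = Field.cast_(f)
    if field.name() == "Classification":
        continue  # dropped here, replaced below
    doc2.add(field)
doc2.add(StringField("Classification", new_value, Field.Store.YES))

writer = IndexWriter(SimpleFSDirectory(File(location)),
                     IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()))
writer.updateDocument(Term("fid", doc.get("fid")), doc2)  # replaces the old copy
writer.commit()
writer.close()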
Example #5
Berlin - "Gentrifizierung, die: Aufwertung eines Stadtteils durch dessen Sanierung oder Umbau mit der Folge, dass die dort ansässige Bevölkerung durch wohlhabendere Bevölkerungsschichten verdrängt wird."

So beschreibt der Duden ein Phänomen, das vor allem in Großstädten zu beobachten ist: Viele Menschen ziehen in günstige Wohnviertel, durch die Nachfrage steigen die Preise, bis die Mieten schließlich nur noch für die Wohlhabenderen bezahlbar sind - und die Künstler, Studenten, Geringverdiener weichen müssen. In der Weihnachtszeit hat nun ein Fall aus Berlin für Aufsehen gesorgt, der von einigen als Beispiel für Gentrifizierung genannt wird. Auf Facebook hat ein Nutzer die aktuelle Infobroschüre einer evangelischen Kirchengemeinde in Berlin-Mitte gepostet, Seite 25, Rubrik "Taufen". 29 Namen von Kindern und Erwachsenen sind dort zu lesen, darunter die folgenden:
Viktor Paul Theodor, Ada Mai Helene, Rufus Oliver Friedrich, Cäcilie Helene, Edvard Neo, Freya Luise Apollonia, Frederick Theodor Heinrich, Leonore Anna Maria Chiara Helena. Viele der Nachnamen lassen zudem auf einen adligen Hintergrund schließen.

"Das Comeback alter Adelsgeschlechter in Berlin-Mitte = Gentrifizierung im eigentlichen Sinne", lautet ein Kommentar unter dem Facebook-Foto. "In Dresden Gorbitz sähe die Liste anders aus", schreibt ein anderer Nutzer. Ein dritter fasst zusammen: "So heißt also die Gentrifizierung."

Im "Gentrification Blog", betrieben von einem wissenschaftlichen Mitarbeiter der Berliner Humboldt-Universität, ist vor kurzem ein Beitrag zu der Taufliste erschienen: "Berlin: Am Taufbecken der Gentrification - Kirche im Aufwertungsgebiet" heißt er. Die Liste lese sich "wie eine Mischung aus FDP-Wahlliste für das Europaparlament und dem Verzeichnis der höheren Beamten des Diplomatischen Dienstes", heißt es in dem Artikel. Und: "Der Wortsinn der Gentrification - der ja auf die Wiederkehr des niederen Landadels (der Gentry) in den Städten anspielt - bekommt hier jedenfalls einen unerwarteten Realitätsgehalt."

Zu dem Artikel stellte der Autor eine Taufliste aus dem Jahr 2007 aus einer Gemeinde im benachbarten Stadtteil Prenzlauer Berg. Darauf sind unter anderem diese Namen zu finden: Ruby, Matteo, Iwan, Lennart, Emilia, Annabelle, Andreas, Anke.

Jene Kirchgemeinde, in der die mondänen Namen zur Taufe aufgeführt sind, listet auch die Verstorbenen auf. Zwei sind es in der aktuellen Infobroschüre. Nzitu. Und: Herbert."""

    lucene.initVM()

    # language processor and storage
    analyzer = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File('./data-test/'))

    # writes data to the index
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer, overwrite=True)
    writer = IndexWriter(store, config)

    # add Document
    doc = Document()
    doc.add(Field('content', test_text, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field('url', "http://test.com", Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.updateDocument(Term("url", "http://test.com"), doc)

    writer.commit()
    writer.close()
Example #6
class LuceneManager(object):
    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' %
                                                        self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)

        return self

    def insert(self, document):
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        self.writer.deleteAll()

    def num_docs(self):
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)),
                          BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        return results.totalHits > 0

    def commit(self):
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()  # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(
                token_stream, contents, n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """
        Run search query.
        """
        # TODO: support date range queries

        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND) # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(
            parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'),
                                  QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method
        return [
            self._process_search_result(result) for result in
            self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """
        Used by the "with" statement. Handles close.
        TODO: error handling
        """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """
        Debug what StandardAnalyzer will give on this text.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            #tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
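Because LuceneManager implements __enter__/__exit__, it is intended to be used as a context manager. A hypothetical session; the index root and field values are placeholders, and the field names ('key', 'fullpath', 'last_modified_time', 'body') follow the ones the class itself reads:

# Hypothetical session; paths and document field values are placeholders.
from org.apache.lucene.document import Document, Field, StringField, TextField

with LuceneManager('/tmp/sift-example') as manager:
    doc = Document()
    doc.add(StringField('key', 'doc-1', Field.Store.YES))
    doc.add(StringField('fullpath', '/tmp/notes/doc-1.txt', Field.Store.YES))
    doc.add(StringField('last_modified_time', '2021-01-01T00:00:00', Field.Store.YES))
    doc.add(TextField('body', 'some searchable text', Field.Store.YES))
    manager.insert(doc)
    manager.commit()                    # also refreshes the reader and searcher
    print(manager.exists('doc-1'))
    print(manager.search('searchable'))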
Example #7
class LuceneIndexer(object):
    def __init__(self):
        lucene.initVM()

        # language processor and storage
        self.analyzer = GermanAnalyzer(Version.LUCENE_CURRENT)
        self.store = SimpleFSDirectory(File('./../Lucene/data/'))

        # writes data to the index
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer, overwrite=True)
        self.writer = IndexWriter(self.store, config)


    def add_article(self, article):
        # constructing a document
        doc = Document()

        title = Field('title', article.title, Field.Store.YES, Field.Index.ANALYZED)
        title.setBoost(10.0)
        doc.add(title)

        description = Field('description', article.description, Field.Store.YES, Field.Index.ANALYZED)
        description.setBoost(5.0)
        doc.add(description)

        doc.add(Field('keywords', article.keywords, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article.content, Field.Store.YES, Field.Index.ANALYZED))
        if article.date:
            doc.add(Field('date', article.date, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.last_modified:
            doc.add(Field('last_modified', article.last_modified, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.images:
            doc.add(Field('image_url', article.images[0][0], Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field('image_text', article.images[0][1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field('url', article.url, Field.Store.YES, Field.Index.NOT_ANALYZED))

        # creates document or updates if already exists
        self.writer.updateDocument(Term("url", article.url), doc)

    def write_to_file(self):
        # making changes permanent
        self.writer.commit()
        self.writer.close()



    def perform_search(self, searchterm):
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)

        query = parser.parse(searchterm)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start

        print(scoreDocs)
        print(duration)
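add_article only reads attributes from the article object, so any object exposing title, description, keywords, content, date, last_modified, images, and url will do. A hedged driver, with namedtuple standing in for the project's real article type and placeholder values:

# Illustrative driver; namedtuple stands in for the project's real article object.
from collections import namedtuple

Article = namedtuple('Article', ['title', 'description', 'keywords', 'content',
                                 'date', 'last_modified', 'images', 'url'])
article = Article(
    title='Beispieltitel',
    description='Kurze Beschreibung des Artikels',
    keywords='lucene, beispiel',
    content='Der eigentliche Artikeltext ...',
    date='2016-01-01',
    last_modified=None,
    images=[],                        # optionally [(image_url, image_text)]
    url='http://example.com/artikel',
)

indexer = LuceneIndexer()
indexer.add_article(article)          # updateDocument keyed on the "url" term
indexer.write_to_file()               # commit and close the writer
indexer.perform_search('Artikeltext')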
Example #8
class Index(object):
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

        self._facetsConfig = settings.fieldRegistry.facetsConfig

        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())

    def addDocument(self, term, document):
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)

    def suggest(self, query, count, field):
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords])
        return suggestions

    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms

    def fieldnames(self):
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def drilldownFieldnames(self, path=None, limit=50):
        taxoReader = self._indexAndTaxonomy.taxoReader
        parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else taxoReader.getOrdinal(path[0], path[1:])
        childrenIter = taxoReader.getChildren(parentOrdinal)
        names = []
        while True:
            ordinal = childrenIter.next()
            if ordinal == TaxonomyReader.INVALID_ORDINAL:
                break
            names.append(taxoReader.getPath(ordinal).components[-1])
            if len(names) >= limit:
                break
        return names

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()

    def commit(self):
        if not self._settings.readonly:
            self._taxoWriter.commit()
            self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        if not self._multithreaded:
            return FacetsCollector()
        return FacetSuperCollector(self._indexAndTaxonomy.taxoReader, self._facetsConfig, self._ordinalsReader)

    def facetResult(self, facetCollector):
        facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        self._indexAndTaxonomy.close()
        if not self._settings.readonly:
            self._taxoWriter.close()
            self._indexWriter.close()

    def _analyzeToken(self, token):
        result = []
        reader = StringReader(unicode(token))
        stda = self._settings.analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
Example #9
class ImageIndexer(object):
    """Given an image details the indexer will get all text files, lucene them
    for search and retrieval."""
    hash_field = FieldType()
    hash_field.setStored(True)
    hash_field.setTokenized(False)
    hash_field.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    text_field = FieldType()
    text_field.setStored(False)
    text_field.setTokenized(True)
    text_field.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    mime_map = MimeMapper("/var/www/bcaw/conf/mimemap.conf")

    def __init__(self, store_dir):
        self.store_dir = store_dir
        if not os.path.exists(store_dir):
            os.mkdir(store_dir, 0o777)
        self.store = SimpleFSDirectory(Paths.get(store_dir))
        self.analyzer = StandardAnalyzer()
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.store, self.config)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.writer.close()

    def index_text(self, sha1, full_text):
        """Index the full text and map it to the source sha1."""
        document = Document()
        document.add(Field("sha1", sha1, ImageIndexer.hash_field))
        if full_text:
            document.add(Field("full_text", full_text, ImageIndexer.text_field))
            self.writer.updateDocument(Term("sha1", sha1), document)
        else:
            logging.info("No text for sha1 %s", sha1)

    @classmethod
    def get_path_details(cls, temp_path, image_path):
        """Return the byte sequence and the full text for a given path."""
        byte_sequence = ByteSequence.from_path(temp_path)
        extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
        logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                      extension, byte_sequence.sha1)
        full_text = ""
        if extension is not None:
            try:
                logging.debug("Textract for SHA1 %s, extension map val %s",
                              byte_sequence.sha1, extension)
                full_text = process(temp_path, extension=extension, encoding='ascii',
                                    preserveLineBreaks=True)
            except ExtensionNotSupported as _:
                logging.exception("Textract extension not supported for ext %s", extension)
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except LookupError as _:
                logging.exception("Lookup error for encoding.")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except UnicodeDecodeError as _:
                logging.exception("UnicodeDecodeError, problem with file encoding")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except:
                logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
        return byte_sequence, full_text

    def index_path(self, temp_path, image_path):
        """Index the full text of the file and map it to the file's sha1 and return
        the derived ByteStream object and derived full text as a tuple."""
        byte_sequence, full_text = self.get_path_details(temp_path, image_path)
        if full_text:
            self.index_text(byte_sequence.sha1, full_text)
        return byte_sequence, full_text
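A hypothetical use of ImageIndexer as a context manager; the store directory, sha1, and text below are placeholders (in the project, the sha1 comes from the derived ByteSequence):

# Hypothetical usage; store directory, sha1 and text are placeholders.
with ImageIndexer('/tmp/bcaw-index') as image_indexer:
    image_indexer.index_text('da39a3ee5e6b4b0d3255bfef95601890afd80709',
                             'extracted full text ...')
# __exit__ closes the writer, which by default also commits pending changes.
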
class LuceneHelper:
    def __init__(self, index_dir):
        self.index_dir = index_dir
        self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
        self.q_parser = QueryParser("", WhitespaceAnalyzer())
        self.commit_max = 500000
        self.__get_writer_searcher()

    def __get_writer_searcher(self):
        writerConfig = IndexWriterConfig()
        print(f"Codec : {writerConfig.getCodec()}")
        self.writer = IndexWriter(self.indexDir, writerConfig)

        self.reader = DirectoryReader.open(self.writer)
        self.searcher = IndexSearcher(self.reader)

    def __query(self, query_str, _max=10):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        hits = self.searcher.search(query_cmd, _max)
        print(
            f"Found {hits.totalHits} document(s) that matched query :'{query_cmd}'"
        )
        return hits

    def __count_docs(self, query_str):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        total = self.searcher.count(query_cmd)
        print(f"Found {total} document(s) that matched query :'{query_cmd}'")
        return total

    def refresh_searcher(self):
        self.reader.close()
        self.reader = DirectoryReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)

    def index_stats(self):
        query_str = f"*:*"
        total_docs = self.__count_docs(query_str)
        if total_docs:
            print(f"There is at least total [{total_docs}] docs.")
        else:
            print("There is no index right now.")

    def delete_old_ttl(self):
        now_time = int(time.time())
        # check how many docs expired
        ttl_query = LongPoint.newRangeQuery("ttl", 0, now_time - 1)
        total_docs = self.searcher.count(ttl_query)
        print(f"At least found {total_docs} document(s) are expired.")
        # delete expired docs
        self.writer.deleteDocuments(ttl_query)
        self.writer.commit()

    def add_doc(self, item_data):
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))

        doc = Document()
        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)

    def commit(self):
        self.writer.commit()

    def close(self):
        self.writer.commit()
        self.reader.close()
        self.writer.close()
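Finally, a hypothetical end-to-end run of LuceneHelper, following the item_data shape that add_doc reads; the index path and payload values are placeholders:

# Hypothetical usage; index path and item payload are placeholders.
import time

helper = LuceneHelper('/tmp/item-index')
helper.add_doc({
    'item_id': 'sku-001',
    'ttl': int(time.time()) + 3600,          # expires in one hour
    'version': 'v1',
    'view_similar': {'items': []},
    'view_prospective': {'items': []},
})
helper.commit()
helper.refresh_searcher()   # reopen the reader so the new document is visible
helper.index_stats()
helper.delete_old_ttl()     # deletes documents whose ttl lies in the expired range
helper.close()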