def main(): """An example of using the Indexer wrapper. """ # TODO Command line argument passing # TODO e.g. # TODO -d directory to store index in # TODO -i directory to recursively index import time tt = time.time() filedir = 'aesop' indexName = 'aesopind' if os.path.exists(indexName): for f in os.listdir(indexName): os.remove(os.path.join(indexName, f)) # Remove results of previous runs os.rmdir(indexName) # Create a new Index index = Index(indexName, create=True) index.setMergeFactor(20) # Get the files files = os.listdir(filedir) for name in files: f = os.path.join(filedir, name) if os.path.isdir(f) or os.path.islink(f): continue text = open(f, 'rb').read().decode("latin-1") title = text.split('\n\n\n')[0] print 'indexing:', f # the next line creates a Document with 2 fields # one field is named text and the other is named # filename. The latter is created as Keyword since # the name is preceded by '_'. Naughty but expdient. index.index(text=text, __title=title, _filename=f) # Uncomment the following line to optimize the index. # Have a look in the index dir before you optimize. # You will probably see a dozens of files from # several segments. optimize() merges all the segments # into one. It can be quite an expensive operation, but # it can save space and speed up searches. #index.optimize() queries = [ 'fox', u'intô', 'python', 'fox python', '"the Fox and the"', 'the fox and python' ] for q in queries: hits = index.find(q) print q.encode('utf8'), hits for h in hits: print '\tFound in %s (%s)' % (h.get('filename'), h.get('title')) index.close() print 'Elapsed time:', time.time() - tt
def main(): """An example of using the Indexer wrapper. """ # TODO Command line argument passing # TODO e.g. # TODO -d directory to store index in # TODO -i directory to recursively index import time tt = time.time() filedir = 'aesop' indexName = 'aesopind' if os.path.exists(indexName): for f in os.listdir(indexName): os.remove(os.path.join(indexName, f)) # Remove results of previous runs os.rmdir(indexName) # Create a new Index index = Index(indexName, create = True) index.setMergeFactor(20) # Get the files files = os.listdir(filedir) for name in files: f = os.path.join(filedir, name) if os.path.isdir(f) or os.path.islink(f): continue text = open(f, 'rb').read().decode("latin-1") title = text.split('\n\n\n')[0] print 'indexing:', f # the next line creates a Document with 2 fields # one field is named text and the other is named # filename. The latter is created as Keyword since # the name is preceded by '_'. Naughty but expdient. index.index(text=text, __title=title, _filename=f) # Uncomment the following line to optimize the index. # Have a look in the index dir before you optimize. # You will probably see a dozens of files from # several segments. optimize() merges all the segments # into one. It can be quite an expensive operation, but # it can save space and speed up searches. #index.optimize() queries = ['fox', u'intô', 'python', 'fox python', '"the Fox and the"', 'the fox and python'] for q in queries: hits = index.find(q) print q.encode('utf8'), hits for h in hits: print '\tFound in %s (%s)' % (h.get('filename'), h.get('title')) index.close() print 'Elapsed time:', time.time() - tt
def search(self, results, user, path, query, sort, start, end):
    """ Perform a search in the 'text' field of each searchable item.

    If the search string is enclosed in double quotes, a phrase search
    will be run; otherwise, the search will be for documents containing
    all words specified.  This lupy implementation ignores the sort
    parameter, and always sorts by relevance.

    Matching, readable documents for the page [start, end] are appended
    to `results` (mutated in place) as SearchResult objects; the return
    value is the total number of hits the given user may read.
    # NOTE(review): exact 1-based vs 0-based meaning of start/end is
    # inferred from the arithmetic below — confirm against callers.
    """
    index = Index(self._lupy_index_dir, False)
    hits = index.findInField(text=query)
    numhits = len(hits)
    # lupy is totally brain-dead, as it returns the hits in reverse
    # order (least relevant first), so we have to retrieve *all* the
    # hits and work our way backwards.
    # Also, hits.doc() has an obvious < vs. <= error that makes
    # hits.doc() raise an index-out-of-range error, so we just call
    # hits.getMoreDocs() up front to fetch every hit.
    hits.getMoreDocs(numhits)
    # Go through each hit in reverse order (i.e. most relevant first)
    # and assemble our list of search results.
    skipped = 0
    numhits_accessible = 0
    for x in range(numhits-1, -1, -1):
        d = hits.doc(x)
        # Silently drop documents this user is not permitted to read.
        if not self._can_read(d, user, path):
            continue
        # We got a good one, so tally it up.
        numhits_accessible += 1
        # Paging: skip the first (start - 1) readable hits, then
        # collect at most (end - start + 1) results.
        if skipped < start-1:
            skipped += 1
        else:
            if len(results) < (end - start + 1):
                # Create a SearchResult object and append it.
                sr = SearchResult(d, hits.score(x))
                results.append(sr)
    return numhits_accessible
# import lucene # from lupyne import engine # don't forget to call lucene.initVM # from lupyne.engine.indexers import Indexer # indexer = Indexer() # create an in-memory index (no filename supplied) # indexer.set('name', stored=True) # create stored 'name' field # indexer.set('text') # create indexed 'text' field (the default) # indexer.add(name='sample', text='hello world') # add a document to the index # indexer.commit() # commit changes; document is now searchable # hits = indexer.search('text:hello') # run search and return sequence of documents # len(hits), hits.count # 1 hit retrieved (out of a total of 1) # hit, = hits # hit['name'] # hits support mapping interface for their stored fields # u'sample' # print(hit.id, hit.score) # plus internal doc number and score # # (0, 0.19178301095962524) # print(hit.dict()) # import lucene from lupy.indexer import Index # we create index named "foobar", create True = overwrite existing index = Index('foobar', create=True)
# General Public License as published by the Free Software Foundation. import os, sys import email, email.Iterators, email.Errors import time from lupy.indexer import Index filedir = sys.argv[1] indexName = 'emailindex' tt = time.time() # Create a new Index index = Index(indexName, create=True) # Get the files files = os.listdir(filedir) i=0 for f in files: print 'Indexing', f fp = open(os.path.join(filedir, f)) # Try to parse the message try: msg = email.message_from_file(fp) except email.Errors.MessageParseError: print 'Bad msg:', f continue