Example #1
import os
from lupy.indexer import Index

def main():
    """An example of using the Indexer wrapper.
    """

    # TODO Command line argument passing
    # TODO e.g.
    # TODO -d directory to store index in
    # TODO -i directory to recursively index

    import time
    tt = time.time()

    filedir = 'aesop'
    indexName = 'aesopind'

    if os.path.exists(indexName):
        for f in os.listdir(indexName):
            os.remove(os.path.join(indexName, f))
        # Remove results of previous runs
        os.rmdir(indexName)

    # Create a new Index
    index = Index(indexName, create=True)
    index.setMergeFactor(20)
    # Get the files
    files = os.listdir(filedir)
    for name in files:
        f = os.path.join(filedir, name)
        if os.path.isdir(f) or os.path.islink(f):
            continue
        text = open(f, 'rb').read().decode("latin-1")
        title = text.split('\n\n\n')[0]
        print 'indexing:', f
        # the next line creates a Document with three fields,
        # named text, title and filename. The filename field is
        # created as a Keyword since its name is preceded by '_'.
        # Naughty but expedient.
        index.index(text=text, __title=title, _filename=f)

    # Uncomment the following line to optimize the index.
    # Have a look in the index dir before you optimize.
    # You will probably see dozens of files from
    # several segments. optimize() merges all the segments
    # into one. It can be quite an expensive operation, but
    # it can save space and speed up searches. (A small
    # sketch after this example shows the effect.)

    #index.optimize()

    queries = [
        'fox', u'intô', 'python', 'fox python', '"the Fox and the"',
        'the fox and python'
    ]
    for q in queries:
        hits = index.find(q)
        print q.encode('utf8'), hits
        for h in hits:
            print '\tFound in %s (%s)' % (h.get('filename'), h.get('title'))
    index.close()
    print 'Elapsed time:', time.time() - tt
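
The comments about optimize() above can be checked directly. What follows is a small sketch, not part of the original example: it re-opens the 'aesopind' index built by main(), counts the files left behind by the individual segments, merges them with optimize(), and counts again. It uses only calls that already appear in this listing.

# Sketch: observe the effect of optimize() on the index directory.
# Assumes main() above has already been run and built 'aesopind'.
import os
from lupy.indexer import Index

index = Index('aesopind', create=False)   # open the existing index
before = len(os.listdir('aesopind'))      # one batch of files per segment
index.optimize()                          # merge all segments into one
after = len(os.listdir('aesopind'))       # noticeably fewer files afterwards
index.close()
print 'files before optimize:', before, 'after:', after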
Example #2
    def search(self, results, user, path, query, sort, start, end):
        """
        Perform a search in the 'text' field of each searchable item.

        If the search string is enclosed in double quotes, a phrase
        search will be run; otherwise, the search will be for
        documents containing all words specified.

        This lupy implementation ignores the sort parameter, and
        always sorts by relevance.
        """
        index = Index(self._lupy_index_dir, False)
        hits = index.findInField(text=query)
        numhits = len(hits)

        # lupy is totally brain-dead, as it returns the hits in reverse
        #  order (least relevant first), so we have to retrieve *all* the
        #  hits and work our way backwards
        # also, hits.doc() has an obvious < vs. <= error that makes
        #  hits.doc() raise an index-out-of-range error, so I just call
        #  hits.getMoreDocs() to get every hit
        # (a standalone sketch of this pattern follows the method)
        hits.getMoreDocs(numhits)

        # let's go through each hit in reverse order and assemble our
        #  list of search results.
        skipped = 0
        numhits_accessible = 0
        for x in range(numhits - 1, -1, -1):
            d = hits.doc(x)

            if not self._can_read(d, user, path):
                continue

            # we got a good one, so tally it up
            numhits_accessible += 1

            # create a SearchResult object and append it to the end
            if skipped < start-1:
                skipped += 1
            else:
                if len(results) < (end - start + 1):
                    sr = SearchResult(d, hits.score(x))
                    results.append(sr)

        return numhits_accessible
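
Stripped of the access-control and paging logic, the walk-the-hits-backwards workaround described in the comments can be sketched on its own. The snippet below is an illustration, not part of the original class; it assumes the 'aesopind' index from Example #1 exists and that hits.doc() returns the same kind of document objects that Example #1 iterates over.

# Sketch of the reverse-iteration workaround in isolation.
from lupy.indexer import Index

index = Index('aesopind', False)          # open an existing index (create=False)
hits = index.findInField(text='fox')      # field search, as in search() above
hits.getMoreDocs(len(hits))               # fetch every hit up front (see the comments above)
for x in range(len(hits) - 1, -1, -1):    # least relevant hits come first, so walk backwards
    d = hits.doc(x)
    # assumption: d supports .get(), like the hit objects in Example #1
    print hits.score(x), d.get('filename')
index.close()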
Example #3
# import lucene
# from lupyne import engine
# from lupyne.engine.indexers import Indexer
# lucene.initVM()                                 # lupyne requires lucene.initVM() before use
# indexer = Indexer()                      # create an in-memory index (no filename supplied)
# indexer.set('name', stored=True)                # create stored 'name' field
# indexer.set('text')                             # create indexed 'text' field (the default)
# indexer.add(name='sample', text='hello world')  # add a document to the index
# indexer.commit()                                # commit changes; document is now searchable
# hits = indexer.search('text:hello')             # run search and return sequence of documents
# len(hits), hits.count                           # 1 hit retrieved (out of a total of 1)
# hit, = hits
# hit['name']                                     # hits support mapping interface for their stored fields
# u'sample'
# print(hit.id, hit.score)                               # plus internal doc number and score
# (0, 0.19178301095962524)
# print(hit.dict())

from lupy.indexer import Index
# create an index named "foobar"; create=True overwrites any existing index
index = Index('foobar', create=True)
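
A minimal continuation, not part of the original snippet, that exercises the freshly created 'foobar' index using only the calls already shown in Example #1:

# Sketch: index one small document, search for it, then close the index.
index.index(text='the quick brown fox', _filename='fox.txt')  # '_' prefix -> Keyword field
hits = index.find('fox')
for h in hits:
    print 'found in', h.get('filename')
index.close()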
Example #4

import os, sys
import email, email.Iterators, email.Errors
import time

from lupy.indexer import Index


filedir = sys.argv[1]
indexName = 'emailindex'

tt = time.time()

# Create a new Index
index = Index(indexName, create=True)

# Get the files
files = os.listdir(filedir)

i = 0
for f in files:
    print 'Indexing', f
    fp = open(os.path.join(filedir, f))

    # Try to parse the message
    try:
        msg = email.message_from_file(fp)
    except email.Errors.MessageParseError:
        print 'Bad msg:', f
        continue
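    # --- The original listing is cut off above this point. ---
    # What follows is a hedged sketch, not the original code: it assumes the
    # loop goes on to pull the body text out of the parsed message and feed
    # it to the same lupy Index calls used in Example #1.
    body = ''.join(email.Iterators.body_line_iterator(msg))
    index.index(text=body, _filename=f)   # '_' prefix makes filename a Keyword field
    fp.close()
    i += 1

index.close()
print 'Indexed', i, 'messages. Elapsed time:', time.time() - tt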