Example #1
File: indexer.py  Project: Atom66/tain335
 def _setupIndexer(self):
     if self.searcher is not None:
         self.searcher.close()
         self.searcher = None
     if self.indexer is None:
         self.indexer = IndexWriter(self.name, False, self.analyzer)
         self.indexer.mergeFactor = self.mergeFactor
Example #2
File: indexer.py  Project: Atom66/tain335
 def __init__(self, name, create=False, analyzer=None):
     """
     @param name: Name of the directory for this index.
     @param create: Whether to create this directory or not.
     @type create: boolean
     """
     
     self.name = name
     self.analyzer = analyzer or standardTokenizer
     # Create the index if we need to. From here on we assume
     # that the index exists
     self.indexer = IndexWriter(self.name, create, analyzer)
     # Remember the default merge factor
     self.mergeFactor = self.indexer.mergeFactor
     # Clean up
     self.indexer.close()
     self.indexer = self.searcher = None
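A minimal construction sketch for this class; the import path and directory name are assumptions, and setMergeFactor() comes from the full class shown later on this page:

# Sketch only: import path and directory name are assumptions.
from lupy.indexer import Index   # assumed location of the Index class shown above

idx = Index('wiki-index', create=True)   # creates the directory and an empty index
idx.setMergeFactor(100)                  # optional: batch more documents before merging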
Example #3
    def commit_documents(self):
        # for lupy, put everything in a single index because lupy
        # doesn't support multiple index searching
        writer = IndexWriter(self._lupy_index_dir, False)
        
        while len(self._lupy_queue) > 0:
            qd = self._lupy_queue.pop(0)
            fields = qd[0]
            existing = qd[1]

            """ add a document with the given fields to the index.
            If it's a document that already exists, delete it first, using the url as the key.
            """        
            if existing:
                # commit anything that's already been added by closing the writer
                writer.close()

                _delete_document(self._get_field_value(fields, 'url'))                

                # reopen writer
                writer = IndexWriter(self._lupy_index_dir, False)            

            # create a lupy document consisting of the fields
            d = Document()
            for x in fields:
                # converting to unicode keeps lupy happy, but then when we get the previews back
                #  we get an error when printing, so let's just punt and remove all bad characters
                # value = self._convert_text_from_iso8559_to_unicode(str(x[1]))
                value = self._remove_bad_characters(x[1])
               
                f = Field(str(x[0]), value, x[2], x[3], x[4])
                d.add(f)

            try:
                writer.addDocument(d)
            except:
                print "Lupy: could not add document: %s" % (self._get_field_value(fields, 'text')) 
                
        writer.close()               
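The example relies on a _delete_document() helper that is not shown. Judging from the Index class elsewhere on this page, deletes in Lupy go through a reader rather than the writer, which is why the writer is closed and reopened around the call. A hedged sketch of such a delete step (the function name and import paths are assumptions):

# Hypothetical sketch of the delete step; not the original _delete_document().
# reader.deleteTerm() and indexsearcher.open() are the calls the Index class
# on this page uses for deletes; the import paths are assumptions.
from lupy.search import indexsearcher   # assumed import path
from lupy.index.term import Term        # assumed import path

def delete_by_url(index_dir, url):
    reader = indexsearcher.open(index_dir)   # open a reader on the existing index
    reader.deleteTerm(Term('url', url))      # delete docs whose 'url' field matches
    reader.close()                           # closing the reader commits the deletes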
Example #4
File: testLupy.py  Project: kjk/ipedia-palm
def index(fileName,limit):
    startTiming()
    # create a new index in a directory
    indexer = IndexWriter(g_indexPath, True)

    # supposed to speed up indexing by avoiding disk i/o
    # that's how many documents to index in memory before flushing
    # to disk
    indexer.mergeFactor = 1100

    count = 0
    failedCount = 0
    for (title,ns,txt) in iterGetArticle(fileName):
        if ns != 0:
            continue
        try:
            # iso-8859-1 is the default encoding used in Wikipedia
            txt = txt.decode("iso-8859-1")
            title = title.decode("iso-8859-1")
            indexOneArticle(indexer,title,txt)
            count += 1
            if count >= limit:
                break
            if count % 500 == 0:
                print "indexed %d articles" % count
        except:
            # for now just ignore possible decoding errors
            if g_fVerbose:
                sys.stdout.write('failed to index %s' % title)
            failedCount += 1

    # Optimize the index before closing. Have a look in the index dir
    # before you optimize: you will probably see dozens of files from
    # several segments. optimize() merges all the segments into one.
    # It can be quite an expensive operation, but it can save space
    # and speed up searches.

    indexer.optimize()        
    indexer.close()
    endTiming()

    print "indexed %d articles" % count
    print "failed to index %d articles" % failedCount
    dumpTiming()
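The example calls an indexOneArticle() helper that is not shown. A hedged sketch of what such a helper could look like, built only from the document API used elsewhere on this page (the helper body, field names, and import path are assumptions):

# Hypothetical helper, not the original indexOneArticle(); it only reuses the
# Document / Keyword / Text API shown in the other examples on this page.
from lupy import document      # assumed import path

def indexOneArticle(indexer, title, txt):
    d = document.Document()
    d.add(document.Keyword('title', title))    # stored, not tokenized: returned with hits
    d.add(document.Text('text', txt, False))   # tokenized, not stored: searchable body text
    indexer.addDocument(d)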
Example #5
File: index.py  Project: Atom66/tain335
 def __init__(self, path, create=False):
     """Create an indexer, writing and index to the directory B{path}.
     The boolean flag B{create} determines whether the index is created
     (overwriting an existing index) or updated"""
     
     self.indexer = IndexWriter(path, create)
Example #6
File: index.py  Project: Atom66/tain335
class Indexer:


    def __init__(self, path, create=False):
        """Create an indexer, writing and index to the directory B{path}.
        The boolean flag B{create} determines whether the index is created
        (overwriting an existing index) or updated"""
        
        self.indexer = IndexWriter(path, create)
        

    def addDoc(self, fname):
        """Add a document to the index"""
        
        # create document
        d = document.Document()

        # add a file field containing the path to this file
        f = document.Keyword('filename',fname)
        d.add(f)

        # I happen to know that the title is separated
        # from the story by '\n\n\n', so I can easily get the title
        # which we store in the title field
        fp = open(fname,'rb')
        s = fp.read().decode("latin-1")
        title = s.split('\n\n\n')[0]
        f = document.Text('title',title)
        d.add(f)

        # Here I pass False as the 3rd arg to ensure that
        # the actual text of s is not stored in the index
        # the following lines using TextWithReader are
        # more typical.
        
        f = document.Text('text', s, False)
        d.add(f)

        
        # Add text of an open file (fp)
        # This is typically how you add a file to an index
        # f = field.Text('text', fp)
        # d.add(f)
        
        fp.close()

        # add doc to index
        print 'indexing', fname
        self.indexer.addDocument(d)


    def index(self, dir):
        """Recurse through B{dir} and index the files.
        
        Call optimize() before closing to merge all of the segments
        created by indexing. This is an optional step and can be expensive
        for large indexes.
        """
        for name in os.listdir(dir):
            f = os.path.join(dir, name)
            if os.path.isdir(f) or os.path.islink(f):
                continue
            self.addDoc(f)

        # Uncomment the following line to optimize the index.
        # Have a look in the index dir before you optimize.
        # You will probably see dozens of files from
        # several segments. optimize() merges all the segments
        # into one. It can be quite an expensive operation, but
        # it can save space and speed up searches.
        
        # self.indexer.optimize()
        
        self.indexer.close()
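A short usage sketch for this Indexer class; the import path and directory names are assumptions:

# Sketch only: import path and directory names are assumptions.
from index import Indexer        # the class above comes from a file named index.py

ix = Indexer('story-index', create=True)   # create (or overwrite) the index directory
ix.index('stories/')                       # index every regular file in the directory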
Example #7
File: indexer.py  Project: Atom66/tain335
class Index:

    def __init__(self, name, create=False, analyzer=None):
        """
        @param name: Name of the directory for this index.
        @param create: Whether to create this directory or not.
        @type create: boolean
        """
        
        self.name = name
        self.analyzer = analyzer or standardTokenizer
        # Create the index if we need to. From here on we assume
        # that the index exists
        self.indexer = IndexWriter(self.name, create, analyzer)
        # Remember the default merge factor
        self.mergeFactor = self.indexer.mergeFactor
        # Clean up
        self.indexer.close()
        self.indexer = self.searcher = None
        
    def index(self, **kw):
        """Add a document to the index.
        
        **kw contains the name and values of each Field in the
        Document that we are creating.

        If the key in **kw starts with '_' the field will be created
        as a Keyword. If it starts with '__', it is created as a
        stored Text field (e.g. tokenized and stored), otherwise it
        will be created as a Text field. The leading '_' are removed
        before field creation.

        Text fields will have their value tokenized before
        indexing. The value is not stored in the index.  This is the
        usual type of field that you need for plain text.

        Keyword fields will not have their value tokenized.  The value
        is stored in the index and is returned with search hits on the
        Document. If you wanted to store the path to a document along
        with each document, you would use a Keyword field. The path
        would not be tokenized and its value would be returned in the
        query results, so you could easily open and display the file.
        """
        self._setupIndexer()
        
        # create document
        d = document.Document()

        # TODO - Please find another way of defining fields
        # than magic field names!!!

        # add a file field containing the path to this file
        for key, value in kw.items():
            if key[:2] == '__':
                key = key[2:]
                # Tokenized and stored
                f = document.Text(key, value, True)
            elif key[0] == '_':
                # Not tokenized and stored
                key = key[1:]
                # keyword
                f = document.Keyword(key, value)
            else:
                # Tokenized and not stored
                f = document.Text(key, value, False)
            d.add(f)
        self.indexer.addDocument(d)

    def _setupIndexer(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is None:
            self.indexer = IndexWriter(self.name, False, self.analyzer)
            self.indexer.mergeFactor = self.mergeFactor

    def _setupSearcher(self):
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None
        if self.searcher is None:
            self.searcher = indexsearcher.IndexSearcher(self.name)

    def delete(self, **kw):
        "Delete the first document containing the specified term. See also L{deleteAll}."
        # Not very efficient for bulk deletes
        # Use deleteAll for bulk deletes
        self._setupSearcher()
        if len(kw) != 1:
            raise RuntimeError, 'one and only one field for the moment'
        field, value = kw.items()[0]
        t = Term(field, value)
        self.searcher.reader.deleteTerm(t)
        
    def deleteAll(self, **kw):
        "Remove all documents containing this field and value."
        self.close()
        reader = indexsearcher.open(self.name)
        if len(kw) != 1:
            raise RuntimeError, 'one and only one field for the moment'
        field, values = kw.items()[0]
        for value in values:
            t = Term(field, value)
            reader.deleteTerm(t)
        # commit the deletes
        reader.close()

    def close(self):
        # Indexer and Searchers are different
        # and we have to open the right kind
        # for the operation we are performing.
        # The actual creation is done in the index and find
        # methods. Here we close whatever is open.
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None

    def flush(self):
        """Flush outstanding indexes to disk.

        This makes sure we are searching the latest stuff.
        """
        if self.indexer is not None:
            self.indexer.flushRamSegments()

    def optimize(self):
        """Merge all on-disk segments into a single segment. Saves space and can speed up queries."""
        self._setupIndexer()
        self.indexer.optimize()

    def parse(self, field, qString):
        if qString.startswith('"'):
            qString = qString.strip('"')
            #qWords = qString.strip('"').split()
            qWords = self._tokenize(qString)
            return self.phraseSearch(field, qWords)
        else:
            qWords = self._tokenize(qString)
            if len(qWords) == 1:
                return self.termSearch(field, qWords[0])
            else:
                return self.boolSearch(field, qWords)

    def _tokenize(self, qString):
        return list(self.analyzer(qString))

    def find(self, qStr):
        """Perform a search in any field in this index.

        If the search string is enclosed in double quotes, a phrase
        search will be run; otherwise, the search will be for
        documents containing all words specified."""
        
        self._setupSearcher()
            
        fields = self.searcher.fieldNames()
        if not fields:
            return []
        all = [self.parse(field, qStr) for field in fields]
        if len(all) == 1:
            # simple case
            return self.searcher.search(all[0])
        
        q = BooleanQuery()
        for query in all:
            # OR all of the field queries
            q.add(query, False, False)
        hits = self.searcher.search(q)
        return hits

    def findInField(self, **kw):
        """Search only in a single field."""
        # eg index.findInField(text='flute')
        if len(kw) != 1:
            raise RuntimeError, 'one and only one field for the moment'
        self._setupSearcher()
        field, query = kw.items()[0]
        q = self.parse(field, query)
        hits = self.searcher.search(q)
        return hits
    
    def termSearch(self, field, term):
        "Search for a single C{term} in a C{field}."
        t = Term(field, term)
        q = TermQuery(t)
        return q

    def phraseSearch(self, field, words):
        "Search for a phrase (given as a list of words) in C{field}."
        q = PhraseQuery()
        for word in words:
            t = Term(field, word)
            q.add(t)  
        return q
            
    def boolSearch(self, field, ands=[], ors=[], nots=[]):
        """Build a simple boolean query.

        Each word in C{ands} is equiv to +word
        Each word in C{ors} is equiv to word
        Each word in C{nots} is equiv to -word

        E.g. C{boolSearch(['spam'], ['eggs'], ['parrot', 'cheese'])} is
        equiv to C{+spam eggs -parrot -cheese} in Google/Lucene syntax.
        """
        q = BooleanQuery()

        for a in ands:
            t = Term(field, a)
            tq = TermQuery(t)
            q.add(tq, True, False)
            
        for a in ors:
            t = Term(field, a)
            tq = TermQuery(t)
            q.add(tq, False, False)
            
        for a in nots:
            t = Term(field, a)
            tq = TermQuery(t)
            q.add(tq, False, True)
        
        return q
            
    def printHits(self, hits):
        if len(hits) == 0:
            print 'Nothing found!'
        else:
            for i in range(len(hits)):
                print hits.doc(i), hits.score(i)

    def setMergeFactor(self, anInt):
        "Set how many documents will be processed before the indexes will be merged. Never less than 2."
        # Never less than 2
        if anInt >= 2:
            self.mergeFactor = anInt
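A usage sketch tying together the magic field-name convention described in the index() docstring with find() and printHits(); the import path, directory name, and field values are placeholders:

# Sketch only: import path, directory, and field values are assumptions.
from lupy.indexer import Index   # assumed location of the Index class shown above

idx = Index('notes-index', create=True)
idx.index(_path='/tmp/notes/001.txt',      # leading '_'  -> Keyword: stored, not tokenized
          __title='Shopping list',         # leading '__' -> Text: tokenized and stored
          text='eggs cheese parrot')       # no prefix    -> Text: tokenized, not stored
idx.flush()                                # optional: write in-memory segments to disk

hits = idx.find('cheese')                  # search every field for a single term
idx.printHits(hits)
idx.close()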
Example #8
 def _optimize_index(self):
     index = IndexWriter(self._lupy_index_dir, False)
     index.optimize()
     index.close()