Exemplo n.º 1
0
    def getIndexWriter(self):
        """
        Build and return an IndexWriter over this container's DbDirectory.

        The directory is constructed from the store's current transaction
        (self.store.txn), this container's db handle, the store's blocks db
        handle and this container's flags.  The writer uses a
        StandardAnalyzer and is given False as its create argument, so it
        is opened on an index that is expected to exist already.
        Compound-file mode is switched off before the writer is returned.
        """

        directory = DbDirectory(self.store.txn, self._db,
                                self.store._blocks._db, self._flags)
        analyzer = StandardAnalyzer()

        indexWriter = IndexWriter(directory, analyzer, False)
        indexWriter.setUseCompoundFile(False)

        return indexWriter
Exemplo n.º 2
0
    def open(self, name, txn, **kwds):
        """
        Open the container and, when it is being created, set up its index.

        name, txn and all keyword arguments are forwarded unchanged to the
        superclass open().  If the 'create' keyword is present and true, an
        IndexWriter is opened in create mode over a DbDirectory built from
        txn, this container's db handle, the store's blocks db handle and
        this container's flags, and is then closed again immediately.
        """

        super(IndexContainer, self).open(name, txn, **kwds)

        if not kwds.get('create', False):
            return

        # The writer is opened with create=True purely for its side effect
        # on the directory; nothing is added, so it is closed right away.
        emptyIndexWriter = IndexWriter(
            DbDirectory(txn, self._db, self.store._blocks._db, self._flags),
            StandardAnalyzer(), True)
        emptyIndexWriter.close()
Exemplo n.º 3
0
 def __init__(self, directory, shaManager, app='futil'):
     """
     Set up the index writer and the helpers used while indexing.

     directory  -- index location handed to IndexReader and IndexWriter
     shaManager -- object kept as self.shaBBDD
     app        -- name passed to FutilLogger (defaults to 'futil')
     """
     self._directory = directory

     # Ask the writer to create the index only when none exists there yet.
     indexMissing = not IndexReader.indexExists(self._directory)
     self._writer = IndexWriter(self._directory, StandardAnalyzer(),
                                indexMissing)

     self.shaBBDD = shaManager

     # The URI loader reports through the same logger as this object.
     self.logger = FutilLogger(app)
     self.uriLoader = UriLoader(logger=self.logger)

     self.resetCounter()
Exemplo n.º 4
0
class IndexAppService(Indexer):
    """
    Indexer that adds FOAF documents to an IndexWriter-backed index and
    records URI/sha pairs through the sha manager it is given.
    """

    def __init__(self, directory, shaManager, app='futil'):
        """
        Open (or create) the index and set up logging and URI loading.

        directory  -- index location handed to IndexReader and IndexWriter
        shaManager -- object kept as self.shaBBDD; must provide
                      insertUriSha(uri, sha) and close()
        app        -- name passed to FutilLogger (defaults to 'futil')
        """
        self._directory = directory
        # Create the index only when none exists at that location yet.
        create = not IndexReader.indexExists(self._directory)
        self._writer = IndexWriter(self._directory, StandardAnalyzer(), create)
        self.shaBBDD = shaManager
        self.logger = FutilLogger(app)
        self.uriLoader = UriLoader(logger=self.logger)
        self.resetCounter()

    def resetCounter(self):
        """Restart the countdown to the next optimize() at 1000 insertions."""
        self.counter = 1000

    def countInsertion(self):
        """Count one insertion and optimize the index on every 1000th one."""
        self.counter -= 1
        if self.counter == 0:
            self.resetCounter()
            self._writer.optimize()

    def indexFOAF(self, foaf):
        """
        Index one FOAF mapping and record the sha values it carries.

        foaf is a dict-like object.  Its 'sha', 'uri' and 'friends' entries
        are read when present; 'friends' is a sequence of (sha, uri) pairs.

        Returns the list of friend URIs (the second element of every
        'friends' pair, including those with an empty sha), or [] when
        there is no 'friends' entry.
        """
        document = FoafDocumentFactory.getDocumentFromFOAF(foaf)
        self._writer.addDocument(document)
        self.countInsertion()

        # `in` replaces the deprecated dict.has_key().
        if 'sha' in foaf:
            for sha in foaf['sha']:
                self.shaBBDD.insertUriSha(foaf['uri'][0], sha)

        if 'friends' not in foaf:
            return []

        friends = foaf['friends']
        for friendSha, friendUri in friends:
            # Pairs with an empty sha are skipped here but their URI is
            # still part of the returned list.
            if friendSha != '':
                self.shaBBDD.insertUriSha(friendUri, friendSha)
        return [friendUri for (_friendSha, friendUri) in friends]

    def indexFOAFUri(self, foafUri):
        """
        Load the FOAF found at foafUri and index it.

        Returns whatever indexFOAF returns, or [] when loading or indexing
        raised an exception; the failure is logged and not propagated.
        """
        try:
            foaf = self.uriLoader.getFoafFrom(foafUri)
            return self.indexFOAF(foaf)
        except Exception:
            # Deliberately best effort, but no longer a bare `except:`, so
            # on Python 2.5+ KeyboardInterrupt and SystemExit propagate.
            # %-formatting keeps the handler from raising a TypeError of
            # its own when foafUri is not a string.
            self.logger.info("Unknow error indexing %s" % (foafUri,))
            return []

    def close(self):
        """
        Close the index writer (if still open) and the sha manager.

        The writer reference is dropped before closing, so a repeated call
        does not try to close the writer twice, and the sha manager is
        closed even when closing the writer raises.
        """
        writer, self._writer = self._writer, None
        try:
            if writer:
                writer.close()
        finally:
            self.shaBBDD.close()
Exemplo n.º 5
0
        try:
            doc = Document()
            doc.add(Field("span_id", span_id,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(Field("pmid", pmid,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(Field("text", span_text,
                          Field.Store.YES, Field.Index.TOKENIZED))
            addAnnotations(doc, span_id)
            writer.addDocument(doc)
        except Exception, e:
            sys.stderr.write("error: %s pmid: %s span_id: %s\n" % (e, pmid, span_id))
        i += 2

if __name__ == '__main__':
    # Command line: data_norm index_dir [annotation_file ...]
    # data_norm and index_dir are both required.  Checking only for
    # "no arguments at all" would let a single argument through and fail
    # with an IndexError on sys.argv[2], so anything short of two
    # arguments gets the usage line instead.
    # Single-argument print(...) behaves the same as the print statement
    # on Python 2.
    if len(sys.argv) < 3:
        print("Usage: python index_spans.py data_norm index_dir annotation_files")
    else:
        data_norm, index_dir = sys.argv[1], sys.argv[2]
        annotation_files = sys.argv[3:]
        print("Loading annotations ...")
        load(annotation_files)
        print("Making the index ...")
        # `writer` has to stay a module-level name: the indexing code
        # earlier in this script calls writer.addDocument() on it.
        writer = IndexWriter(index_dir, StandardAnalyzer(), True)
        try:
            writer.setMaxFieldLength(7 * 1000 * 1000 * 10)
            indexData(data_norm)
            print("Optimizing index ...")
            writer.optimize()
            print("Indexing complete")
        finally:
            # Close the writer even when indexing fails part-way, so the
            # index is not left open.
            writer.close()
#import PyLucene
from mailbox import UnixMailbox
from PyLucene import StandardAnalyzer, FSDirectory, IndexWriter
from email_loader import EmailDoc
import os,sys,datetime,email,config

# Starting assumptions before any earlier update is looked up: a fixed
# "last update" date of 2000-01-01 and a request for a new index.
createNewIndex = True
lastUp = datetime.datetime(2000, 1, 1)

# -q / --quiet as the first command-line argument turns progress output off.
quiet = len(sys.argv) > 1 and sys.argv[1] in ('-q', '--quiet')

# Index directory at config.DB_PATH and a writer on top of it; both calls
# are given True as their last argument.
store = FSDirectory.getDirectory(config.DB_PATH, True)
writer = IndexWriter(store, StandardAnalyzer(), True)

# Single-mbox variant, kept for reference only (never executed):
#
#   mailbox = UnixMailbox( open('chipy.mbox') )
#   while True:
#       msg = mailbox.next()
#       if msg == None: break
#       writer.addDocument( EmailDoc(msg) )

source = config.MAILDIR_ROOT_DIR

# Walk the maildir tree, announcing each directory unless running quietly.
for root, dirs, files in os.walk(source):
    if not quiet:
        sys.stdout.write('\nindexing files in %s' % root)
        sys.stdout.flush()
Exemplo n.º 7
0
#!/usr/bin/env python2.4

from mailbox import UnixMailbox
from PyLucene import StandardAnalyzer, FSDirectory, IndexWriter
from email import EmailDoc

# Build an index under 'chipy-index' from every message in chipy.mbox.
store = FSDirectory.getDirectory('chipy-index', True)
writer = IndexWriter(store, StandardAnalyzer(), True)

# Nested try/finally (this script targets python2.4, which has no `with`)
# makes sure the mbox file and the writer are both closed even when
# reading or indexing a message fails.
try:
    mboxFile = open('chipy.mbox')
    try:
        mailbox = UnixMailbox(mboxFile)
        # mailbox.next() returns None once the mbox is exhausted, which is
        # the sentinel that ends the iteration.
        for msg in iter(mailbox.next, None):
            writer.addDocument(EmailDoc(msg))
    finally:
        mboxFile.close()
finally:
    writer.close()
Exemplo n.º 8
0
#!/usr/bin/env python2.4

from mailbox import UnixMailbox
from PyLucene import StandardAnalyzer, FSDirectory, IndexWriter
from email import EmailDoc

# Index every message of chipy.mbox into the 'chipy-index' directory.
store = FSDirectory.getDirectory('chipy-index', True)
writer = IndexWriter(store, StandardAnalyzer(), True)

# try/finally rather than `with`: the shebang targets python2.4.  The
# writer and the mbox file handle are released even if a message fails
# to load or index.
try:
    mboxFile = open('chipy.mbox')
    try:
        mailbox = UnixMailbox(mboxFile)
        while True:
            msg = mailbox.next()
            # next() hands back None when there are no more messages.
            if msg is None:
                break
            writer.addDocument(EmailDoc(msg))
    finally:
        mboxFile.close()
finally:
    writer.close()
Exemplo n.º 9
0
                      Field.Index.UN_TOKENIZED))
            doc.add(
                Field("pmid", pmid, Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(
                Field("text", span_text, Field.Store.YES,
                      Field.Index.TOKENIZED))
            addAnnotations(doc, span_id)
            writer.addDocument(doc)
        except Exception, e:
            sys.stderr.write("error: %s pmid: %s span_id: %s\n" %
                             (e, pmid, span_id))
        i += 2


if __name__ == '__main__':
    # Expected arguments: data_norm index_dir [annotation_file ...]
    # The first two are mandatory.  Testing only for an empty command line
    # would accept a lone argument and then crash with an IndexError when
    # sys.argv[2] is read, so the usage line is shown whenever fewer than
    # two arguments are given.
    # print(...) with one argument is equivalent to the print statement on
    # Python 2.
    if len(sys.argv) < 3:
        print("Usage: python index_spans.py data_norm index_dir annotation_files")
    else:
        data_norm = sys.argv[1]
        index_dir = sys.argv[2]
        annotation_files = sys.argv[3:]
        print("Loading annotations ...")
        load(annotation_files)
        print("Making the index ...")
        # Keep `writer` as a module-level name: the document-building code
        # above in this script uses it via writer.addDocument().
        writer = IndexWriter(index_dir, StandardAnalyzer(), True)
        try:
            writer.setMaxFieldLength(7 * 1000 * 1000 * 10)
            indexData(data_norm)
            print("Optimizing index ...")
            writer.optimize()
            print("Indexing complete")
        finally:
            # The writer is closed whether or not indexing succeeded.
            writer.close()