Exemplo n.º 1
0
def index_document(docid, document, catalog=None):
    """Index ``document`` under ``docid`` and commit the change.

    :param docid: id passed to the catalog's ``index_doc``.
    :param document: the object to index.
    :param catalog: catalog to index into; when ``None``, the catalog
        returned by ``get_repozecatalog()`` is used instead.
    """
    manager = ConnectionManager()
    target = get_repozecatalog() if catalog is None else catalog
    target.index_doc(docid, document)
    # The manager is used only to commit; it is never handed to a factory.
    manager.commit()
Exemplo n.º 2
0
def initialize_catalog():
    """Create the catalog indexes the first time this is called.

    Later calls do nothing: the module-level ``_initialized`` flag is set
    once the indexes have been committed.
    """
    global _initialized
    if _initialized:
        return

    # Open the catalog through a connection manager
    conn_manager = ConnectionManager()
    cat = factory(conn_manager)

    # Register the two indexes with their discriminator callables
    cat['flavors'] = CatalogFieldIndex(get_flavor)
    cat['text'] = CatalogTextIndex(get_text)

    # Persist the new indexes, then release the connection
    conn_manager.commit()
    conn_manager.close()
    _initialized = True
Exemplo n.º 3
0
def prep_catalog():
    """Download python mailing list, create new catalog and catalog
       messages, if not done already.

       Nothing is indexed when ``test.zodb`` already exists inside
       ``BENCHMARK_DATA_DIR``.  Otherwise a catalog named 'benchmark' is
       created, five indexes are added, and every message produced by
       ``MessageIterator()`` is indexed under consecutive ids starting
       at 1, committing after every 100 messages and once more at the end.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if os.path.exists(zodb_file):
        return

    # Create a catalog
    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
    c = factory(manager)

    # Create some indices
    c['subject'] = CatalogFieldIndex(get_subject)
    c['date'] = CatalogFieldIndex(get_date)
    c['sender_email'] = CatalogFieldIndex(get_sender_email)
    c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
    c['text'] = CatalogTextIndex(get_text)
    manager.commit()

    # Loop over messages to get base line
    profiler.start("Loop over messages without indexing")
    for _ in MessageIterator():
        pass
    profiler.stop("Loop over messages without indexing")

    profiler.start("Index messages")
    indexed = 0
    for indexed, msg in enumerate(MessageIterator(), 1):
        c.index_doc(indexed, msg)
        # Commit in batches.  The modulo is required here: a division
        # compared with 0 is only true for the first few ids (or never,
        # under true division), so no periodic commit would happen.
        if indexed % 100 == 0:
            manager.commit()
    manager.commit()
    manager.close()

    profiler.stop("Index messages")
    # ``indexed`` is the number of messages actually indexed (0 if none).
    # The parenthesised single-argument form prints the same text whether
    # ``print`` is a statement or a function.
    print("Indexed %d messages" % indexed)
Exemplo n.º 4
0
def setup_repozecatalog(app, default_dbpath='repozecatalog.db',
                             default_dbname='catalog'):
    """Set up full text searching with repoze.catalog.

    Registers the database path and name as configuration variables on
    ``app``, opens the catalog described by the resulting configuration,
    stores it as ``app.repozecatalog`` and emits the
    'repozecatalog-installed' event with the catalog as argument.

    :param app: application object; must provide ``instance_dir``,
        ``add_config_var`` and ``cfg``.
    :param default_dbpath: default database file; a relative path is
        resolved against ``app.instance_dir``.
    :param default_dbname: default name of the catalog in the database.
    """
    # If it's not an absolute path, make it relative to the instance dir
    dbpath = default_dbpath
    if not os.path.isabs(dbpath):
        dbpath = os.path.join(app.instance_dir, dbpath)

    app.add_config_var(DBPATH_CONF, str, dbpath)
    app.add_config_var(DBNAME_CONF, str, default_dbname)

    manager = ConnectionManager()
    # The effective values are read back from the configuration, so they
    # may differ from the defaults registered above.
    open_catalog = FileStorageCatalogFactory(
        app.cfg[DBPATH_CONF], app.cfg[DBNAME_CONF])
    # NOTE(review): the factory is called without the manager, so the
    # manager is only used for the commit below -- confirm this is intended.
    app.repozecatalog = catalog = open_catalog()
    manager.commit()

    emit_event('repozecatalog-installed', catalog)
Exemplo n.º 5
0
def run():
    """Prepare the benchmark catalog if needed and profile some searches.

    Each search is wrapped in a ``profiler.start`` / ``profiler.stop``
    pair and its result iterable is fully consumed inside the timed
    section.  The profiler stack is printed at the end.
    """
    # Download mailbox archive of python mailing list and build
    # catalog if needed
    prep_catalog()

    # Open a catalog
    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(
        os.path.join(BENCHMARK_DATA_DIR, 'test.zodb'), 'benchmark')
    c = factory(manager)

    # Do some searches.  ``print(...)`` with one parenthesised argument
    # prints the same text whether ``print`` is a statement or a function.

    profiler.start("unsorted retrieval")
    n, results = c.search(date=('0', 'Z'))
    print('%d results ' % n)
    # Consume the results so retrieval is included in the timing
    for result in results:
        pass
    profiler.stop("unsorted retrieval")

    profiler.start("repeat unsorted retrieval")
    n, results = c.search(date=('0', 'Z'))
    print('%d results ' % n)
    # Consume the results so retrieval is included in the timing
    for result in results:
        pass
    profiler.stop("repeat unsorted retrieval")

    profiler.start("sorted retrieval")
    n, results = c.search(date=('0', 'Z'), sort_index='subject')
    print('%d results ' % n)
    for result in results:
        pass
    profiler.stop("sorted retrieval")

    profiler.start("reverse sorted retrieval")
    n, results = c.search(date=('0', 'Z'), sort_index='subject', reverse=True)
    print('%d results ' % n)
    for result in results:
        pass
    profiler.stop("reverse sorted retrieval")

    profiler.start('limit to topic=year:2000')
    n, results = c.search(topics=['year:2000'])
    print('%d results' % n)
    # Keep the matches: they are the input of the facet count below
    L = list(results)
    profiler.stop("limit to topic=year:2000")

    profiler.start('count limited to topic=year:2000')
    print(c['topics'].counts(L, ['year:2000']))
    profiler.stop('count limited to topic=year:2000')

    profiler.stop()
    profiler.print_stack()
Exemplo n.º 6
0
def prep_catalog():
    """Download python mailing list, create new catalog and catalog
       messages, if not done already.

       When ``test.zodb`` is missing from ``BENCHMARK_DATA_DIR``, a catalog
       named 'benchmark' is created with five indexes and every message
       from ``MessageIterator()`` is indexed under consecutive ids starting
       at 1.  Work is committed every ``batch_size`` messages and once
       more after the last message.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if not os.path.exists(zodb_file):
        # Create a catalog
        manager = ConnectionManager()
        factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
        c = factory(manager)

        # Create some indices
        c['subject'] = CatalogFieldIndex(get_subject)
        c['date'] = CatalogFieldIndex(get_date)
        c['sender_email'] = CatalogFieldIndex(get_sender_email)
        c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
        c['text'] = CatalogTextIndex(get_text)
        manager.commit()

        # Loop over messages to get base line
        profiler.start("Loop over messages without indexing")
        for _ in MessageIterator():
            pass
        profiler.stop("Loop over messages without indexing")

        profiler.start("Index messages")
        batch_size = 100
        count = 0
        for msg in MessageIterator():
            count += 1
            c.index_doc(count, msg)
            # Periodic commit.  This must be a remainder test: comparing a
            # quotient with 0 never matches under true division, so no
            # intermediate commit would ever happen.
            if count % batch_size == 0:
                manager.commit()
        manager.commit()
        manager.close()

        profiler.stop("Index messages")
        # Report the number of messages indexed, not the next free id
        print("Indexed %d messages" % count)
Exemplo n.º 7
0
def initialize_catalog():
    """Set up the 'flavors' and 'texts' indexes once per process.

    The module-level ``_initialized`` flag makes repeated calls no-ops.
    """
    global _initialized
    if _initialized:
        return

    # Obtain the catalog through a connection manager
    conn_manager = ConnectionManager()
    cat = factory(conn_manager)

    # Both indexes are given a string ('flavor' / 'text') rather than a
    # callable as their discriminator
    cat['flavors'] = CatalogFieldIndex('flavor')
    cat['texts'] = CatalogTextIndex('text')

    # Store the indexes and release the connection
    conn_manager.commit()
    conn_manager.close()
    _initialized = True
Exemplo n.º 8
0
def do_benchmark(fname, nd, nk1, nk2, out=sys.stdout):
    """Compare two intersection algorithms on a freshly built catalog.

    A catalog with two field indexes ('one' and 'two') is created in
    ``fname`` and filled with ``nd`` documents whose attribute values are
    ``str(docid % nk1)`` and ``str(docid % nk2)``.  Random queries are then
    applied with ``Intersection1`` and ``Intersection2``; the time spent in
    each ``apply`` call is accumulated and the two result sets are checked
    for equality.

    Every file matching ``fname + "*"`` is deleted both before the catalog
    is built and after the queries have run.

    :param fname: path of the catalog storage file.
    :param nd: number of documents to index.
    :param nk1: number of distinct keys in index 'one'.
    :param nk2: number of distinct keys in index 'two'.
    :param out: object with a ``write`` method that receives the report.
    :returns: tuple ``(cost1 / cost2, cumulative1 / cumulative2)``, the
        predicted and the measured ratio.
    """
    def emit(line=""):
        # One report line per call.  Writing to ``out`` directly works
        # whether ``print`` is a statement or a function.
        out.write(line + "\n")

    cumulative1 = 0.0
    cumulative2 = 0.0

    emit("Index 1:")
    emit("\t# docs: %d" % nd)
    emit("\t# distinct keys: %d" % nk1)
    emit("Index 2:")
    emit("\t# docs: %d" % nd)
    emit("\t# distinct keys: %d" % nk2)
    emit()

    cost1, cost2 = predictions(nd, nk1, nk2)

    emit('Cost1: %0.2f' % cost1)
    emit('Cost2: %0.2f' % cost2)
    emit()
    emit("Prediction:")
    if cost1 > cost2:
        emit("Algorithm 2 %0.2f times faster than Algorithm 1" % (
            cost1/cost2))
    else:
        emit("Algorithm 1 %0.2f times faster than Algorithm 2" % (
            cost2/cost1))

    emit()
    emit("Setting up indexes...")
    # Start from a clean slate: drop leftovers of a previous run
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(fname, 'intersection')
    catalog = factory(manager)

    catalog['one'] = CatalogFieldIndex('one')
    catalog['two'] = CatalogFieldIndex('two')

    class Document(object):
        """Document whose two indexed attributes cycle through the keys."""
        def __init__(self, docid):
            self.one = str(docid % nk1)
            self.two = str(docid % nk2)

    for docid in range(nd):
        catalog.index_doc(docid, Document(docid))
    manager.commit()
    manager.close()

    N_QUERIES = 1000
    emit("Running %d queries for each algorithm..." % N_QUERIES)
    # Reopen the catalog: the previous connection was closed above
    catalog = factory(manager)
    # Loop on N_QUERIES so the announced and the executed counts agree
    for _ in range(N_QUERIES):
        key1 = random.randrange(nk1)
        key2 = random.randrange(nk2)
        query1 = Intersection1(Eq('one', str(key1)), Eq('two', str(key2)))
        query2 = Intersection2(Eq('one', str(key1)), Eq('two', str(key2)))

        # Only the ``apply`` calls are timed, not query construction
        start = time.time()
        result1 = query1.apply(catalog)
        cumulative1 += time.time() - start

        start = time.time()
        result2 = query2.apply(catalog)
        cumulative2 += time.time() - start

        # Both algorithms must return the same documents
        s1 = sorted(result1)
        s2 = sorted(result2)

        assert s1 == s2, (s1, s2)

    manager.close()
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    emit()
    emit("Result:")
    emit("Time for algorithm1: %0.3f s" % cumulative1)
    emit("Time for algorithm2: %0.3f s" % cumulative2)
    if cumulative1 > cumulative2:
        emit("Algorithm 2 %0.2f times faster than Algorithm 1" % (
            cumulative1/cumulative2))
    else:
        emit("Algorithm 1 %0.2f times faster than Algorithm 2" % (
            cumulative2/cumulative1))
    return cost1 / cost2, cumulative1 / cumulative2
Exemplo n.º 9
0
def get_text(object, default):
    """Return the ``thetext`` attribute of ``object``, or ``default``.

    ``default`` is returned when looking up the attribute raises
    ``AttributeError``.
    """
    try:
        return object.thetext
    except AttributeError:
        return default

# Set to True once initialize_catalog() has committed the indexes
_initialized = False


def initialize_catalog():
    """Create and commit the catalog indexes on the first call only."""
    global _initialized
    if _initialized:
        return

    # Open the catalog through a connection manager
    conn_manager = ConnectionManager()
    cat = factory(conn_manager)

    # Register the indexes with their discriminator callables
    cat['flavors'] = CatalogFieldIndex(get_flavor)
    cat['text'] = CatalogTextIndex(get_text)

    # Persist the indexes and release the connection
    conn_manager.commit()
    conn_manager.close()
    _initialized = True

if __name__ == '__main__':
    # Create the indexes (first call only), then open the catalog again
    # through a new connection manager.
    initialize_catalog()
    manager = ConnectionManager()
    catalog = factory(manager)

    # Sample documents keyed by document id
    content = {
        1: Content('peach', 'i am so very very peachy'),
        2: Content('pistachio', 'i am nutty'),
    }
    for docid in content:
        doc = content[docid]
        catalog.index_doc(docid, doc)

    manager.commit()
Exemplo n.º 10
0
    if not _initialized:
        # create a catalog
        manager = ConnectionManager()
        catalog = factory(manager)
        # set up indexes
        catalog['flavors'] = CatalogFieldIndex('flavor')
        catalog['texts'] = CatalogTextIndex('text')
        # commit the indexes
        manager.commit()
        manager.close()
        _initialized = True


class Content(object):
    """Simple document with a ``flavor`` and a ``text`` attribute."""

    def __init__(self, flavor, text):
        # Store both values unchanged on the instance
        self.flavor, self.text = flavor, text


if __name__ == '__main__':
    # Run the one-time catalog setup, then open the catalog through a
    # new connection manager for indexing.
    initialize_catalog()
    manager = ConnectionManager()
    catalog = factory(manager)

    # Sample documents keyed by document id
    content = {
        1: Content('peach', 'i am so very very peachy'),
        2: Content('pistachio', 'i am nutty'),
    }
    for docid in content:
        doc = content[docid]
        catalog.index_doc(docid, doc)

    manager.commit()