Exemplo n.º 1
0
def prep_catalog():
    """Download the python mailing list and build the benchmark catalog.

    Every step is skipped when its result is already on disk:

    * ``BENCHMARK_DATA_DIR`` is created when it does not exist.
    * The mailing list archive is fetched with ``MailListSucker`` when
      ``get_mailbox_filenames()`` returns nothing.
    * ``test.zodb`` is created, its indices are registered and every message
      yielded by ``MessageIterator()`` is indexed, when the file is missing.

    Timing for a plain pass over the messages and for the indexing pass is
    recorded through the module-level ``profiler``.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if os.path.exists(zodb_file):
        return

    # Create a catalog
    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
    c = factory(manager)

    # Create some indices
    c['subject'] = CatalogFieldIndex(get_subject)
    c['date'] = CatalogFieldIndex(get_date)
    c['sender_email'] = CatalogFieldIndex(get_sender_email)
    c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
    c['text'] = CatalogTextIndex(get_text)
    manager.commit()

    # Loop over messages to get base line
    profiler.start("Loop over messages without indexing")
    for _ in MessageIterator():
        pass
    profiler.stop("Loop over messages without indexing")

    profiler.start("Index messages")
    indexed = 0
    # Document ids start at 1, as before; ``indexed`` tracks how many
    # messages were actually processed so the final report is exact.
    for docid, msg in enumerate(MessageIterator(), 1):
        c.index_doc(docid, msg)
        indexed = docid
        # Commit in batches of 100.  The previous test (``id / 100 == 0``)
        # used division instead of modulo, so the batch commit did not fire
        # every 100 messages as intended.
        if docid % 100 == 0:
            manager.commit()
    manager.commit()
    manager.close()

    profiler.stop("Index messages")
    print("Indexed %d messages" % indexed)
Exemplo n.º 2
0
def run():
    """Build the catalog if needed, then profile a series of searches.

    The searches are, in order: two identical unsorted date-range
    retrievals, a retrieval sorted by ``subject``, the same retrieval in
    reverse order, a facet search on ``year:2000`` and a facet count over
    the results of that last search.  Each one is bracketed by
    ``profiler.start`` / ``profiler.stop`` and the collected timings are
    printed with ``profiler.print_stack()`` at the end.

    The connection manager is closed once the searches are finished, even
    when one of them raises.
    """
    # Download mailbox archive of python mailing list and build
    # catalog if needed
    prep_catalog()

    # Open a catalog
    manager = ConnectionManager()
    try:
        factory = FileStorageCatalogFactory(
            os.path.join(BENCHMARK_DATA_DIR, 'test.zodb'), 'benchmark')
        c = factory(manager)

        # Do some searches.  The first four share the same date range and
        # only differ in their profiler label and sort options.
        date_range = ('0', 'Z')
        retrievals = (
            ("unsorted retrieval", {}),
            ("repeat unsorted retrieval", {}),
            ("sorted retrieval", {'sort_index': 'subject'}),
            ("reverse sorted retrieval",
             {'sort_index': 'subject', 'reverse': True}),
        )
        for label, options in retrievals:
            profiler.start(label)
            n, results = c.search(date=date_range, **options)
            print('%d results ' % n)
            # Force generator to marshall brains, so that consuming the
            # results is part of the measured time.
            for _ in results:
                pass
            profiler.stop(label)

        profiler.start('limit to topic=year:2000')
        n, results = c.search(topics=['year:2000'])
        print('%d results' % n)
        # Keep the results: they are the input of the facet count below.
        matches = list(results)
        profiler.stop("limit to topic=year:2000")

        profiler.start('count limited to topic=year:2000')
        print(c['topics'].counts(matches, ['year:2000']))
        profiler.stop('count limited to topic=year:2000')
    finally:
        # Previously the manager was left open for the life of the process.
        manager.close()

    profiler.stop()
    profiler.print_stack()
Exemplo n.º 3
0
def prep_catalog():
    """Download the python mailing list and build the benchmark catalog.

    Every step is skipped when its result is already on disk:

    * ``BENCHMARK_DATA_DIR`` is created when it does not exist.
    * The mailing list archive is fetched with ``MailListSucker`` when
      ``get_mailbox_filenames()`` returns nothing.
    * ``test.zodb`` is created, its indices are registered and every message
      yielded by ``MessageIterator()`` is indexed, when the file is missing.

    Timing for a plain pass over the messages and for the indexing pass is
    recorded through the module-level ``profiler``.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if os.path.exists(zodb_file):
        return

    # Create a catalog
    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
    c = factory(manager)

    # Create some indices
    c['subject'] = CatalogFieldIndex(get_subject)
    c['date'] = CatalogFieldIndex(get_date)
    c['sender_email'] = CatalogFieldIndex(get_sender_email)
    c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
    c['text'] = CatalogTextIndex(get_text)
    manager.commit()

    # Loop over messages to get base line
    profiler.start("Loop over messages without indexing")
    for _ in MessageIterator():
        pass
    profiler.stop("Loop over messages without indexing")

    profiler.start("Index messages")
    indexed = 0
    # Document ids start at 1, as before; ``indexed`` tracks how many
    # messages were actually processed so the final report is exact.
    for docid, msg in enumerate(MessageIterator(), 1):
        c.index_doc(docid, msg)
        indexed = docid
        # Commit in batches of 100.  The previous test (``id / 100 == 0``)
        # used division instead of modulo: with integer division it is true
        # for every id below 100 and never afterwards, so commits were not
        # spread evenly over the run.
        if docid % 100 == 0:
            manager.commit()
    manager.commit()
    manager.close()

    profiler.stop("Index messages")
    # Single-argument parenthesised form prints the same text whether
    # ``print`` is a statement or a function.
    print("Indexed %d messages" % indexed)
Exemplo n.º 4
0
def do_benchmark(fname, nd, nk1, nk2, out=sys.stdout):
    """Compare ``Intersection1`` and ``Intersection2`` on a fresh catalog.

    A catalog with two field indexes (``one`` and ``two``) is created at
    ``fname`` and filled with ``nd`` documents whose indexed values are
    ``str(docid % nk1)`` and ``str(docid % nk2)``.  Random equality queries
    are then run through both intersection implementations, timing each and
    asserting that both return the same document ids.

    WARNING: every file matching ``fname + "*"`` is deleted before the
    catalog is created and again after the queries have run.

    Args:
        fname: Path of the catalog storage file; also the glob prefix used
            for cleanup.
        nd: Number of documents to index.
        nk1: Modulus used for the ``one`` index values.
        nk2: Modulus used for the ``two`` index values.
        out: File-like object that receives the textual report.

    Returns:
        A tuple ``(cost1 / cost2, cumulative1 / cumulative2)``: the ratio
        of the costs returned by ``predictions`` and the ratio of the
        measured cumulative times.

    Raises:
        ZeroDivisionError: If one of the divisors used in the report or in
            the return value (a predicted cost or a cumulative time) is
            zero.
    """
    cumulative1 = 0.0
    cumulative2 = 0.0

    print("Index 1:", file=out)
    print("\t# docs: %d" % nd, file=out)
    print("\t# distinct keys: %d" % nk1, file=out)
    print("Index 2:", file=out)
    print("\t# docs: %d" % nd, file=out)
    print("\t# distinct keys: %d" % nk2, file=out)
    print("", file=out)

    cost1, cost2 = predictions(nd, nk1, nk2)

    print('Cost1: %0.2f' % cost1, file=out)
    print('Cost2: %0.2f' % cost2, file=out)
    print("Prediction:", file=out)
    if cost1 > cost2:
        print("Algorithm 2 %0.2f times faster than Algorithm 1" %
              (cost1 / cost2),
              file=out)
    else:
        print("Algorithm 1 %0.2f times faster than Algorithm 2" %
              (cost2 / cost1),
              file=out)

    print("", file=out)
    print("Setting up indexes...", file=out)
    # Start from a clean slate: drop anything left over by a previous run.
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(fname, 'intersection')
    catalog = factory(manager)

    catalog['one'] = CatalogFieldIndex('one')
    catalog['two'] = CatalogFieldIndex('two')

    class Document(object):
        # Minimal object exposing the two attributes the indexes are
        # registered for.
        def __init__(self, docid):
            self.one = str(docid % nk1)
            self.two = str(docid % nk2)

    for docid in xrange(nd):
        catalog.index_doc(docid, Document(docid))
    manager.commit()
    manager.close()

    N_QUERIES = 1000
    print("Running %d queries for each algorithm..." % N_QUERIES, file=out)
    # Ask the factory for the catalog again after the close above.
    catalog = factory(manager)
    # The loop bound is N_QUERIES (it used to be a separate literal 1000) so
    # the announced and the executed number of queries cannot drift apart.
    for _ in xrange(N_QUERIES):
        key1 = str(random.randrange(nk1))
        key2 = str(random.randrange(nk2))
        query1 = Intersection1(Eq('one', key1), Eq('two', key2))
        query2 = Intersection2(Eq('one', key1), Eq('two', key2))

        start = time.time()
        result1 = query1.apply(catalog)
        cumulative1 += time.time() - start

        start = time.time()
        result2 = query2.apply(catalog)
        cumulative2 += time.time() - start

        # Sorting happens outside the timed sections; it only serves the
        # equivalence check between the two algorithms.
        s1 = sorted(result1)
        s2 = sorted(result2)

        assert s1 == s2, (s1, s2)

    manager.close()
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    print("", file=out)
    print("Result:", file=out)
    print("Time for algorithm1: %0.3f s" % cumulative1, file=out)
    print("Time for algorithm2: %0.3f s" % cumulative2, file=out)
    if cumulative1 > cumulative2:
        print("Algorithm 2 %0.2f times faster than Algorithm 1" %
              (cumulative1 / cumulative2),
              file=out)
    else:
        print("Algorithm 1 %0.2f times faster than Algorithm 2" %
              (cumulative2 / cumulative1),
              file=out)
    return cost1 / cost2, cumulative1 / cumulative2
Exemplo n.º 5
0
def do_benchmark(fname, nd, nk1, nk2, out=sys.stdout):
    """Compare ``Intersection1`` and ``Intersection2`` on a fresh catalog.

    A catalog with two field indexes (``one`` and ``two``) is created at
    ``fname`` and filled with ``nd`` documents whose indexed values are
    ``str(docid % nk1)`` and ``str(docid % nk2)``.  Random equality queries
    are then run through both intersection implementations, timing each and
    asserting that both return the same document ids.

    WARNING: every file matching ``fname + "*"`` is deleted before the
    catalog is created and again after the queries have run.

    Args:
        fname: Path of the catalog storage file; also the glob prefix used
            for cleanup.
        nd: Number of documents to index.
        nk1: Modulus used for the ``one`` index values.
        nk2: Modulus used for the ``two`` index values.
        out: File-like object that receives the textual report.

    Returns:
        A tuple ``(cost1 / cost2, cumulative1 / cumulative2)``: the ratio
        of the costs returned by ``predictions`` and the ratio of the
        measured cumulative times.

    Raises:
        ZeroDivisionError: If one of the divisors used in the report or in
            the return value (a predicted cost or a cumulative time) is
            zero.
    """
    cumulative1 = 0.0
    cumulative2 = 0.0

    print >>out, "Index 1:"
    print >>out, "\t# docs: %d" % nd
    print >>out, "\t# distinct keys: %d" % nk1
    print >>out, "Index 2:"
    print >>out, "\t# docs: %d" % nd
    print >>out, "\t# distinct keys: %d" % nk2
    print >>out, ""

    cost1, cost2 = predictions(nd, nk1, nk2)

    print >>out, 'Cost1: %0.2f' % cost1
    print >>out, 'Cost2: %0.2f' % cost2
    print >>out
    print >>out, "Prediction:"
    if cost1 > cost2:
        print >>out, "Algorithm 2 %0.2f times faster than Algorithm 1" % (
            cost1 / cost2)
    else:
        print >>out, "Algorithm 1 %0.2f times faster than Algorithm 2" % (
            cost2 / cost1)

    print >>out, ""
    print >>out, "Setting up indexes..."
    # Start from a clean slate: drop anything left over by a previous run.
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(fname, 'intersection')
    catalog = factory(manager)

    catalog['one'] = CatalogFieldIndex('one')
    catalog['two'] = CatalogFieldIndex('two')

    class Document(object):
        # Minimal object exposing the two attributes the indexes are
        # registered for.
        def __init__(self, docid):
            self.one = str(docid % nk1)
            self.two = str(docid % nk2)

    for docid in xrange(nd):
        catalog.index_doc(docid, Document(docid))
    manager.commit()
    manager.close()

    N_QUERIES = 1000
    print >>out, "Running %d queries for each algorithm..." % N_QUERIES
    # Ask the factory for the catalog again after the close above.
    catalog = factory(manager)
    # The loop bound is N_QUERIES (it used to be a separate literal 1000) so
    # the announced and the executed number of queries cannot drift apart.
    for _ in xrange(N_QUERIES):
        key1 = str(random.randrange(nk1))
        key2 = str(random.randrange(nk2))
        query1 = Intersection1(Eq('one', key1), Eq('two', key2))
        query2 = Intersection2(Eq('one', key1), Eq('two', key2))

        start = time.time()
        result1 = query1.apply(catalog)
        cumulative1 += time.time() - start

        start = time.time()
        result2 = query2.apply(catalog)
        cumulative2 += time.time() - start

        # Sorting happens outside the timed sections; it only serves the
        # equivalence check between the two algorithms.
        s1 = sorted(result1)
        s2 = sorted(result2)

        assert s1 == s2, (s1, s2)

    manager.close()
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    print >>out, ""
    print >>out, "Result:"
    print >>out, "Time for algorithm1: %0.3f s" % cumulative1
    print >>out, "Time for algorithm2: %0.3f s" % cumulative2
    if cumulative1 > cumulative2:
        print >>out, "Algorithm 2 %0.2f times faster than Algorithm 1" % (
            cumulative1 / cumulative2)
    else:
        print >>out, "Algorithm 1 %0.2f times faster than Algorithm 2" % (
            cumulative2 / cumulative1)
    return cost1 / cost2, cumulative1 / cumulative2