def initialize_catalog():
    """Create and persist the catalog with its indexes, once per process.

    Guarded by the module-level ``_initialized`` flag so repeated calls are
    no-ops.  Index discriminators are the ``get_flavor``/``get_text``
    callables defined elsewhere in this module.
    """
    global _initialized
    if not _initialized:
        # Create a catalog via the module-level factory.
        manager = ConnectionManager()
        catalog = factory(manager)

        # Set up indexes.
        catalog['flavors'] = CatalogFieldIndex(get_flavor)
        catalog['text'] = CatalogTextIndex(get_text)

        # Commit the indexes.  BUGFIX: close in a finally block so the
        # ZODB connection is not leaked if the commit raises; on failure
        # _initialized stays False so a retry is possible.
        try:
            manager.commit()
        finally:
            manager.close()
        _initialized = True
def initialize_catalog():
    """Build and commit the catalog exactly once per process.

    Indexes are keyed by the document attribute names they read
    ('flavor', 'text'); subsequent calls return immediately.
    """
    global _initialized
    if _initialized:
        return
    # Open a connection and obtain the catalog from the module factory.
    conn = ConnectionManager()
    cat = factory(conn)
    # Register the indexes, each discriminating on a document attribute.
    cat['flavors'] = CatalogFieldIndex('flavor')
    cat['texts'] = CatalogTextIndex('text')
    # Persist the index definitions and release the connection.
    conn.commit()
    conn.close()
    _initialized = True
def prep_catalog():
    """Download python mailing list, create new catalog and catalog messages,
    if not done already.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if not os.path.exists(zodb_file):
        # Create a catalog
        manager = ConnectionManager()
        factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
        c = factory(manager)

        # Create some indices
        c['subject'] = CatalogFieldIndex(get_subject)
        c['date'] = CatalogFieldIndex(get_date)
        c['sender_email'] = CatalogFieldIndex(get_sender_email)
        c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
        c['text'] = CatalogTextIndex(get_text)
        manager.commit()

        # Loop over messages to get base line
        profiler.start("Loop over messages without indexing")
        for _ in MessageIterator():
            pass
        profiler.stop("Loop over messages without indexing")

        profiler.start("Index messages")
        docid = 1  # renamed from `id` to avoid shadowing the builtin
        for msg in MessageIterator():
            c.index_doc(docid, msg)
            docid += 1
            # BUGFIX: was `id / 100 == 0`, which is true for every id below
            # 100 and never afterwards, so it committed per-document for the
            # first 99 messages and then not at all; `%` restores the
            # intended commit-every-100-documents batching.
            if docid % 100 == 0:
                manager.commit()
        manager.commit()
        manager.close()
        profiler.stop("Index messages")
        # BUGFIX: docid ends one past the last assigned id, so the number
        # of indexed messages is docid - 1.
        print("Indexed %d messages" % (docid - 1))
def prep_catalog():
    """Download python mailing list, create new catalog and catalog messages,
    if not done already.
    """
    if not os.path.exists(BENCHMARK_DATA_DIR):
        os.makedirs(BENCHMARK_DATA_DIR)

    # Check to see if mailing list data already present
    if len(get_mailbox_filenames()) == 0:
        MailListSucker(MAILLIST_INDEX, BENCHMARK_DATA_DIR).suck()

    # Create ZODB and index maillist messages, if not yet done
    zodb_file = os.path.join(BENCHMARK_DATA_DIR, 'test.zodb')
    if not os.path.exists(zodb_file):
        # Create a catalog
        manager = ConnectionManager()
        factory = FileStorageCatalogFactory(zodb_file, 'benchmark')
        c = factory(manager)

        # Create some indices
        c['subject'] = CatalogFieldIndex(get_subject)
        c['date'] = CatalogFieldIndex(get_date)
        c['sender_email'] = CatalogFieldIndex(get_sender_email)
        c['topics'] = CatalogFacetIndex(get_topics, topic_taxonomy)
        c['text'] = CatalogTextIndex(get_text)
        manager.commit()

        # Loop over messages to get base line
        profiler.start("Loop over messages without indexing")
        for _ in MessageIterator():
            pass
        profiler.stop("Loop over messages without indexing")

        profiler.start("Index messages")
        docid = 1  # renamed from `id` to avoid shadowing the builtin
        for msg in MessageIterator():
            c.index_doc(docid, msg)
            docid += 1
            # BUGFIX: was `id / 100 == 0`, true for every id below 100 and
            # never afterwards — it committed per-document for the first 99
            # messages and then not at all; `%` restores the intended
            # commit-every-100-documents batching.
            if docid % 100 == 0:
                manager.commit()
        manager.commit()
        manager.close()
        profiler.stop("Index messages")
        # BUGFIX: docid ends one past the last assigned id, so report
        # docid - 1 indexed messages.  Single-argument print() is valid and
        # equivalent under both Python 2 and 3.
        print("Indexed %d messages" % (docid - 1))
def do_benchmark(fname, nd, nk1, nk2, out=sys.stdout):
    """Benchmark the two intersection algorithms against each other.

    fname -- path prefix for the throwaway FileStorage database files
    nd    -- number of documents to index
    nk1   -- number of distinct keys in index 'one'
    nk2   -- number of distinct keys in index 'two'
    out   -- stream the report is written to (default: sys.stdout)

    Returns (cost1/cost2, cumulative1/cumulative2): the predicted and the
    measured algorithm-1/algorithm-2 time ratios.
    """
    cumulative1 = 0.0
    cumulative2 = 0.0

    print >>out, "Index 1:"
    print >>out, "\t# docs: %d" % nd
    print >>out, "\t# distinct keys: %d" % nk1
    print >>out, "Index 2:"
    print >>out, "\t# docs: %d" % nd
    print >>out, "\t# distinct keys: %d" % nk2
    print >>out, ""

    cost1, cost2 = predictions(nd, nk1, nk2)
    print >>out, 'Cost1: %0.2f' % cost1
    print >>out, 'Cost2: %0.2f' % cost2
    print >>out
    print >>out, "Prediction:"
    if cost1 > cost2:
        print >>out, "Algorithm 2 %0.2f times faster than Algorithm 1" % (
            cost1 / cost2)
    else:
        print >>out, "Algorithm 1 %0.2f times faster than Algorithm 2" % (
            cost2 / cost1)

    print >>out, ""
    print >>out, "Setting up indexes..."
    # Remove any leftover database files from a previous run.
    for fn in glob.glob(fname + "*"):
        os.remove(fn)
    manager = ConnectionManager()
    factory = FileStorageCatalogFactory(fname, 'intersection')
    catalog = factory(manager)
    catalog['one'] = CatalogFieldIndex('one')
    catalog['two'] = CatalogFieldIndex('two')

    class Document(object):
        # Synthetic document whose key values cycle mod nk1 / nk2.
        def __init__(self, docid):
            self.one = str(docid % nk1)
            self.two = str(docid % nk2)

    for docid in range(nd):
        catalog.index_doc(docid, Document(docid))
    manager.commit()
    manager.close()

    N_QUERIES = 1000
    print >>out, "Running %d queries for each algorithm..." % N_QUERIES
    catalog = factory(manager)
    # BUGFIX: the loop bound was a hard-coded 1000; using N_QUERIES keeps
    # the reported query count and the actual number of queries in sync.
    for _ in range(N_QUERIES):
        key1 = random.randrange(nk1)
        key2 = random.randrange(nk2)
        query1 = Intersection1(Eq('one', str(key1)), Eq('two', str(key2)))
        query2 = Intersection2(Eq('one', str(key1)), Eq('two', str(key2)))

        start = time.time()
        result1 = query1.apply(catalog)
        cumulative1 += time.time() - start

        start = time.time()
        result2 = query2.apply(catalog)
        cumulative2 += time.time() - start

        # Both algorithms must return the same document set.
        s1 = sorted(list(result1))
        s2 = sorted(list(result2))
        assert s1 == s2, (s1, s2)

    manager.close()
    # Clean up the throwaway database files.
    for fn in glob.glob(fname + "*"):
        os.remove(fn)

    print >>out, ""
    print >>out, "Result:"
    print >>out, "Time for algorithm1: %0.3f s" % cumulative1
    print >>out, "Time for algorithm2: %0.3f s" % cumulative2
    if cumulative1 > cumulative2:
        print >>out, "Algorithm 2 %0.2f times faster than Algorithm 1" % (
            cumulative1 / cumulative2)
    else:
        print >>out, "Algorithm 1 %0.2f times faster than Algorithm 2" % (
            cumulative2 / cumulative1)
    return cost1 / cost2, cumulative1 / cumulative2