connection = Connection()
db = connection[DB_NAME]
coll = db[COLL_NAME]

# Remove everything in the collection.
coll.remove()

# Populate the database with data/biz.dat: one "name<TAB>url" record per line.
# The `with` block closes the input file as soon as loading is finished.
with open('data/biz.dat') as biz_file:
    for line in biz_file:
        # A list comprehension (rather than map) so len() and indexing work
        # whether or not map() returns a list.
        binfo_l = [s.strip() for s in line.split('\t')]
        assert len(binfo_l) == 2
        coll.insert({'name': binfo_l[0], 'url': binfo_l[1]})

# Generate bloom filter based on 'url'.
bfcoll = db[COLL_NAME]  # get another collection object
bfcoll = bloomify(bfcoll, 'url')

# Generate test files.
# The output file is closed (and therefore flushed) before anything reads it
# back; previously the handle was left open, so the test set could still be
# sitting in the write buffer when timed_test() opened the same path.
T1 = 'data/testset'
with open(T1, 'w') as testset_file:
    gendata(1000, 0.5, output_file=testset_file)


# Function to perform timed tests.
def timed_test(testf_path, coll):
    """Time one ``find_one`` lookup per record of a test file.

    Args:
        testf_path: Path of a text file holding one tab-separated
            ``name<TAB>url`` record per line.
        coll: Object exposing ``find_one(query)``; it is queried once per
            record with ``{'url': <url>}``.

    Returns:
        Elapsed timer value (``toc - tic``) covering the whole read-and-query
        loop, in seconds.

    Raises:
        AssertionError: If a line does not split into exactly two fields.
    """
    # time.clock was removed in Python 3.8; use perf_counter where it exists
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    tic = timer()
    with open(testf_path) as test_file:
        for line in test_file:
            binfo_l = [field.strip() for field in line.split("\t")]
            assert len(binfo_l) == 2
            # Only the URL is used for the lookup; the name is ignored.
            _name, burl = binfo_l
            coll.find_one({'url': burl})
    toc = timer()
    return toc - tic
connection = Connection()
db = connection[DB_NAME]
coll = db[COLL_NAME]

# Start from an empty collection.
coll.remove()

# Four sample documents: user_a..user_d carrying the values 1..4.
data = [
    {"name": "user_" + suffix, "value": value}
    for value, suffix in enumerate("abcd", 1)
]

# Populate the collection with the sample documents.
coll.insert(data)

# Generate the bloom filter based on 'name'.
bfcoll = bloomify(coll, "name")

# Checks: every inserted element is expected to be reported by the filter
# and to be retrievable from the collection.
for doc in data:
    assert coll._bf.contains(doc["name"])  # expected to be True
    assert coll.find_one(doc) is not None  # expected to find the document

# Attempt to find something that is not in the database.
qry = {"name": "XXXy"}
assert not coll._bf.contains("XXXy")  # SHOULD be False: "XXXy" was never inserted
assert coll.find_one(qry) is None  # nothing matches, so None comes back