def stream_tasks(inDB_path, num_task, num_proc):
    '''
    This procedure generates all vector pairs and stores them in a
    ShareDB instance located at inDB_path. Once all the work is
    queued up, inDB is closed.
    '''
    # Open inDB to write vector pairs
    inDB = ShareDB(
        path=inDB_path,      # Store output where specified
        reset=True,          # Recreate inDB if previously existing
        serial='msgpack',    # msgpack offers optimal serialization for lists
        readers=num_proc+1,  # Allow all (num_proc+1) processes to read in parallel
        compress=True,       # Serialized msgpack-ed lists are further compressed
        map_size=5*10**8)    # And we estimate to require ~500MB for results

    # Queue all work
    current_token = 0
    while current_token < num_task:
        # Choose vector size
        base_size = np.random.randint(10**4, 10**5)
        # Generate vectors
        X = tuple(int(v) for v in np.random.randint(100, size=(1, base_size))[0])
        Y = tuple(int(v) for v in np.random.randint(100, size=(1, base_size))[0])
        # Write to inDB
        inDB[current_token] = (X, Y)
        print('QUEUED WORK # {}'.format(current_token))
        current_token += 1  # Jump to next token

    # Close inDB
    inDB.close()
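# A quick spot-check sketch (hypothetical helper, not part of the original
# example): reopen the queue and confirm one pair round-trips. Note that
# msgpack returns the stored tuples as lists.
def spot_check_queue(inDB_path):
    inDB = ShareDB(path=inDB_path)
    X, Y = inDB[0]           # First queued vector pair, deserialized as lists
    assert len(X) == len(Y)  # Both vectors in a pair share the same base_size
    inDB.close()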
def merge_results(mergeDB_path, outDB_paths):
    '''
    This procedure simply merges all individual outDBs into a single
    ShareDB instance stored at mergeDB_path.
    '''
    # Open mergeDB to store merged convolution results
    mergeDB = ShareDB(
        path=mergeDB_path,  # Store output where specified
        reset=True,         # Recreate mergeDB if previously existing
        serial='msgpack',   # msgpack offers optimal serialization for lists
        readers=2,          # At most 2 processes would read mergeDB in parallel
        compress=True,      # Serialized msgpack-ed lists are further compressed
        map_size=5*10**8)   # And we estimate to require ~500MB for results

    # Equivalent bulk alternative (requires `from itertools import chain`):
    # # Open and chain all key-value pairs in results
    # outDB_list = (ShareDB(path) for path in outDB_paths)
    # results = chain.from_iterable(
    #     outDB.items() for outDB in outDB_list)
    # # Merge all individual results
    # mergeDB.multiset(results)
    # # All results merged ... we're done!
    # mergeDB.close()

    # Merge all individual results
    for outDB_path in outDB_paths:
        outDB = ShareDB(outDB_path)
        mergeDB.multiset(outDB.items())
        print('Merged results = {}'.format(len(mergeDB)))

    # All results merged ... we're done!
    mergeDB.close()
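# A minimal end-to-end driver sketch, assuming stream_tasks, merge_results,
# and para_conv (defined later in this section) live in one importable
# module; the paths and counts below are illustrative.
import multiprocessing

def run_pipeline(num_task=8, num_proc=4):
    inDB_path = './inDB'
    outDB_paths = ['./outDB_{}'.format(i) for i in range(num_proc)]
    # Queue all vector pairs up front
    stream_tasks(inDB_path, num_task, num_proc)
    # Launch one executor per process
    workers = [
        multiprocessing.Process(
            target=para_conv,
            args=(inDB_path, outDB_paths[i], i, num_task, num_proc))
        for i in range(num_proc)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    # Collect every executor's results into a single store
    merge_results('./mergeDB', outDB_paths)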
def get_myDB_resources(total):
    '''
    Initialize a populated ShareDB instance and associated resources.
    '''
    # Setup random
    seed = random.random()
    gri_stream = gri(seed=seed)

    # Initialize ShareDB instance
    myDB = ShareDB(
        path='./myDB',
        reset=True,
        serial='msgpack',
        compress=random.choice([True, False]),
        readers=40,
        buffer_size=100,
        map_size=10**7)

    # Populate myDB with random items and record keys
    key_val_dict = {}
    while len(key_val_dict) < total:
        key = next(gri_stream)
        val = next(gri_stream)
        if key not in key_val_dict:
            myDB[key] = val
            key_val_dict[key] = val

    # Generate some keys not seen before
    non_key_set = set()
    while len(non_key_set) < total:
        non_key = next(gri_stream)
        if non_key not in key_val_dict:
            non_key_set.add(non_key)

    # Return resources
    return myDB, key_val_dict, non_key_set
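# A usage sketch for the resources returned above (assumption: gri yields
# hashable items that round-trip through msgpack, e.g. ints or strings);
# only operations demonstrated elsewhere in this section are used.
def example_myDB_usage():
    myDB, key_val_dict, non_key_set = get_myDB_resources(total=10)
    # Every recorded key-value pair should be retrievable from myDB
    assert len(myDB) == len(key_val_dict)
    for key, val in key_val_dict.items():
        assert myDB[key] == val
    # Clean up the on-disk store
    myDB.drop()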
def main():
    store_path = './kvStore'
    num_items = 1000000
    length = 25
    write_thp(store_path, num_items, length)
    print('\n')
    read_thp(store_path)
    ShareDB(store_path).drop()
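# Standard entry-point guard (an assumption about how the benchmark
# script is invoked):
if __name__ == '__main__':
    main()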
def get_tinyDB(map_size):
    '''
    Initialize a ShareDB with msgpack serialization.
    '''
    myDB = ShareDB(
        path='./tinyDB',
        reset=True,
        serial='msgpack',
        readers=40,
        buffer_size=100,
        map_size=map_size)
    return myDB
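# A minimal round-trip sketch for the helper above; the map_size and the
# key-value pair are illustrative, not part of the original suite.
def example_tinyDB_usage():
    tinyDB = get_tinyDB(map_size=10**7)
    tinyDB[1] = [1, 2, 3]
    assert tinyDB[1] == [1, 2, 3]  # msgpack round-trips the list unchanged
    tinyDB.drop()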
def read_thp(store_path):
    '''
    Go through store and report reading throughput.
    '''
    kvStore = ShareDB(store_path)
    i = 1.
    tt = 0.0
    while i <= len(kvStore):
        t0 = time.time()
        val = kvStore[i]
        tt += time.time() - t0
        print('READER thp @ {:.2f} rd/sec | SCAN {:.2f}%'.format(
            i / tt, (100. * i) / len(kvStore)))
        i += 1
def write_thp(store_path, num_items, length):
    '''
    Fill store with random DNA strings and report throughput.
    '''
    kvStore = ShareDB(
        store_path, True, 'msgpack', map_size=num_items*100)
    i = 1.
    tt = 0.0
    while i <= num_items:
        # Build a random DNA string of the given length,
        # outside the timed region
        val = ''.join(random.choice('ATGC') for _ in range(length))
        t0 = time.time()
        kvStore[i] = val
        tt += time.time() - t0
        print('WRITER thp @ {:.2f} wt/sec | FILL {:.2f}%'.format(
            i / tt, (100. * i) / num_items))
        i += 1
def msgpack_myDB():
    '''
    Initialize a ShareDB with msgpack serialization.
    '''
    msgpack_myDB = ShareDB(
        path='./myDB.msgpack',
        reset=True,
        serial='msgpack',
        compress=True,
        readers=40,
        buffer_size=100,
        map_size=10**7)
    # Release the handle and re-open the same store
    # to exercise persistence across instantiations
    msgpack_myDB = None
    msgpack_myDB = ShareDB(path='./myDB.msgpack', reset=False)
    yield msgpack_myDB
    # Teardown: remove the store once the consuming test finishes
    msgpack_myDB.drop()
def pickle_myDB():
    '''
    Initialize a ShareDB with pickle serialization.
    '''
    pickle_myDB = ShareDB(
        path='./myDB.pickle',
        reset=True,
        serial='pickle',
        compress=True,
        readers=40,
        buffer_size=100,
        map_size=10**7)
    # Release the handle and re-open the same store
    # to exercise persistence across instantiations
    pickle_myDB = None
    pickle_myDB = ShareDB(path='./myDB.pickle', reset=False)
    yield pickle_myDB
    # Teardown: remove the store once the consuming test finishes
    pickle_myDB.drop()
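# A usage sketch, assuming msgpack_myDB and pickle_myDB are registered as
# pytest fixtures (e.g. via @pytest.fixture); this test body is hypothetical
# and not part of the original suite.
def test_set_get_roundtrip(msgpack_myDB, pickle_myDB):
    for myDB in (msgpack_myDB, pickle_myDB):
        myDB[1] = ['some', 'value']
        assert myDB[1] == ['some', 'value']  # Value round-trips under either serializer
        assert len(myDB) == 1                # Fixtures start from an empty store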
def test_ShareDB_path():
    '''
    Test Exceptions and success when path is occupied by file.
    '''
    # Setup ShareDB init fail via file occupancy
    path = './test_init.ShareDB'
    with open(path, 'w') as outfile:
        pass
    # Raises TypeError because path points to a file
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(path=path)
    # Automatically remove file when reset is True
    myDB = ShareDB(path=path, reset=True)
    myDB.drop()
def para_conv(inDB_path, outDB_path, exec_id, num_task, num_proc):
    '''
    This procedure computes the convolution of vector pairs stored in
    a ShareDB instance located at inDB_path, and writes the results to
    a ShareDB instance located at outDB_path. The procedure ends once
    all task tokens assigned to this executor have been processed.
    '''
    # Open inDB to read vector pairs
    inDB = ShareDB(path=inDB_path)

    # Open outDB to write convolution results
    outDB = ShareDB(
        path=outDB_path,               # Store output where specified
        reset=True,                    # Recreate outDB if previously existing
        serial='msgpack',              # msgpack offers optimal serialization for lists
        readers=num_proc+1,            # Allow all (num_proc+1) processes to read in parallel
        compress=True,                 # Serialized msgpack-ed lists are further compressed
        map_size=5*10**8 // num_proc)  # And we split total allocation uniformly

    # Actual computation loop
    key_iter = stream_task_token(exec_id, num_task, num_proc)
    current_token = exec_id
    # Get vector pairs
    for X, Y in inDB.multiget(key_iter):
        # Log execution initiation
        print('EXECUTING WORK # {}'.format(current_token))
        # Actual execution
        result = tuple(int(v) for v in np.convolve(X, Y))  # Compute result as a tuple of ints
        outDB[current_token] = result                      # Insert compressed result in outDB
        # Log execution completion
        print('COMPLETED WORK # {}'.format(current_token))
        # Update token for logging
        current_token += num_proc

    # Log executor completion
    print('EXECUTOR # {} COMPLETED'.format(exec_id))

    # Time to close outDB ... we're done!
    outDB.close()
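# stream_task_token is consumed above but not shown in this section; a
# minimal sketch consistent with how para_conv strides current_token
# (executor exec_id handles tasks exec_id, exec_id + num_proc, ... below
# num_task) might look like this:
def stream_task_token(exec_id, num_task, num_proc):
    token = exec_id
    while token < num_task:
        yield token        # Key of the next vector pair for this executor
        token += num_proc  # Stride over to this executor's next task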
def test_close_drop():
    '''
    Test close and drop.
    '''
    # Successful close
    myDB = ShareDB(path='./test_close_drop')
    assert myDB.close() == True
    assert myDB.close() == False
    # Once closed transaction raises RuntimeError
    with pytest.raises(RuntimeError) as error:
        myDB[1] = 2
    # Successful drop
    myDB = ShareDB(path='./test_close_drop', reset=True)
    assert myDB.drop() == True
    assert myDB.drop() == False
    # Once dropped transaction raises RuntimeError
    with pytest.raises(RuntimeError) as error:
        myDB[1] = 2
def test_ShareDB_init_param_fails():
    '''
    Test Exceptions to be raised on bad instantiation.
    '''
    with pytest.raises(TypeError) as error:
        myDB = ShareDB()
        myDB.drop()
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(path=True)
        myDB.drop()
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(path=123)
        myDB.drop()
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(
            path='./test_init.ShareDB',
            reset=True,
            serial='something_fancy')
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(
            path='./test_init.ShareDB',
            reset=True,
            serial='pickle',
            compress='AbsoluteTruth')
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(
            path='./test_init.ShareDB',
            reset=True,
            readers='XYZ',
            buffer_size=100,
            map_size=10**3)
    with pytest.raises(TypeError) as error:
        myDB = ShareDB(
            path='./test_init.ShareDB',
            reset=True,
            readers='XYZ',
            buffer_size=100,
            map_size=0)
    myDB = ShareDB(path='./test_init.ShareDB', reset=True)
    myDB.drop()