def run_benchmark(num_vals=10):
    tmp = doctable.TempFolder('tmp')
    timer = doctable.Timer('creating databases', logfile=tmp.joinpath('log.txt'))

    db1 = doctable.DocTable(schema=TestObj1, target=tmp.joinpath('1.db'), new_db=True)
    db2 = doctable.DocTable(schema=TestObj2, target=tmp.joinpath('2.db'), new_db=True)
    db2.clean_col_files('data')

    timer.step('creating synthetic data')
    data1 = [TestObj1(i) for i in range(num_vals)]
    data2 = [TestObj2(i) for i in range(num_vals)]

    timer.step('insert into table directly')
    db1.insert(data1)

    timer.step('insert into a column file')
    db2.insert(data2)

    timer.step('finished inserting')

    print(f'===========================================')
    print(f'===== Total took: {timer.total_diff()} =================')
    print(f'===========================================')
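# A minimal usage sketch, not from the original script: the benchmark above is
# presumably invoked like this; num_vals=10000 is an arbitrary example value.
if __name__ == '__main__':
    run_benchmark(num_vals=10000)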
def test_parsetreedocs():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')
    db = doctable.DocTable(schema=TestRow, target=':memory:')

    spacydocs = [nlp(t) for t in ex_sents]
    docs = [doctable.ParseTreeDoc.from_spacy(sd) for sd in spacydocs]
    db.insert([{'doc': doc} for doc in docs])

    # select the documents back
    sdocs = [r['doc'] for r in db.select()]
    assert (isinstance(sdocs[0], doctable.ParseTreeDoc))

    for doc, new_doc in zip(docs, sdocs):
        print(repr(doc))
        assert (repr(doc) == repr(new_doc))

        for sent, new_sent in zip(doc, new_doc):
            assert (repr(sent) == repr(new_sent))

            # recall that ner was disabled
            with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                sent.root.ent == ''

            # recall that ner was disabled
            with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                new_sent.root.ent == ''
def make_db():
    schema = (
        ('integer', 'id', dict(primary_key=True)),
        ('string', 'title', dict(unique=True)),
        ('float', 'age'),
    )
    return doctable.DocTable(target=':memory:', schema=schema)
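# A hedged usage sketch of the table returned by make_db(), assuming insert/select
# behave as in the other snippets here; the example rows are hypothetical.
db = make_db()
db.insert({'title': 'first doc', 'age': 1.5})
db.insert({'title': 'second doc', 'age': 3.0})
for row in db.select():
    print(row['title'], row['age'])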
def test_insert_single1(n=20):
    rows = gen_data1(n)
    db = doctable.DocTable(target=':memory:', schema=schema1)
    for r in rows:
        db.insert(r)
    assert (check_db(rows, db, show=False))
def test_init_errors():
    # never actually creates this database
    randdbname = 'randomdbname2349io8oipaudjrtfoajsd.db'

    # new memory db with no schema
    with pytest.raises(ValueError):
        doctable.DocTable(target=':memory:')
    print(doctable.DocTable(target=':memory:', schema=schema))

    # new db with no schema
    with pytest.raises(ValueError):
        doctable.DocTable(target=randdbname, new_db=True)
    #print(doctable.DocTable(target=randdbname, schema=schema))

    # not creating a new db, but the target file doesn't exist
    with pytest.raises(FileNotFoundError):
        doctable.ConnectEngine(target=randdbname, new_db=False)
def new_db():
    db = doctable.DocTable(target=':memory:', schema=(
        ('integer', 'id', dict(primary_key=True)),
        ('string', 'title', dict(unique=True)),
        ('integer', 'year'),
    ))
    return db
        self.name4 = str(self.id)
        self.name5 = str(self.id)
        self.name6 = str(self.id)
        self.name7 = str(self.id)
        self.name8 = str(self.id)
        self.name9 = str(self.id)
        self.name10 = str(self.id)


if __name__ == '__main__':

    # create doctable
    folder = 'tmp_dataclass_benchmark'
    tmpf = doctable.TempFolder(folder)
    db = doctable.DocTable(schema=DataObj, target=f'{folder}/test.db', new_db=True)

    # make data payload
    payload = [DataObj(i) for i in range(100000)]
    dict_payload = [o._doctable_as_dict() for o in payload]

    print(f'==== DataClass ====')
    print(
        f'\t\tDataclass Insert: {timing.time_call(lambda: db.insert(payload))}'
    )
    print(f'\t\tDataclass Select: {timing.time_call(lambda: db.select())}')
    db.delete()

    print(f'==== Dict ====')
    print(
def test_insert_many1(n=20):
    rows = gen_data1(n)
    db = doctable.DocTable(target=':memory:', schema=schema1)
    db.insert(rows)
    assert (check_db(rows, db, show=False))
if __name__ == '__main__':
    mc = MyClass()

    for col in doctable.parse_schema_dataclass(MyClass):
        print(col)
    #for col in mc.sqlalchemy_columns():
    #    print(col)

    print(mc)
    print(mc.lon)
    try:
        print(mc['lon'])
    except Exception as e:
        print(e)

    db = doctable.DocTable(target=':memory:', schema=MyClass)
    db.insert(MyClass('hahhahha', elements='l o l ha ha ha'.split()))
    db.insert(MyClass('whatever'))
    print(db.head())
    print(db.schema_table())
    print(isinstance(mc, doctable.DocTableRow))

    for row in db.select(['idx', 'name']):
        print(f"{row.idx}: {row.name}")

    #print(fields(MyBaseClass))
    #mt = MyTable()
    #print(fields(mt.dclass))
    #print(fields(mc))
    #print(mc['name'])
    #print(mc)
    #print(mc.name)
    #print(fields(MyClass))
import doctable

db = doctable.DocTable(fname='metadata2.sqlite')
print(db)
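# A hedged follow-up sketch: inspecting the opened table with calls used in the
# other snippets above (db.schema_table() and db.head()); this assumes
# metadata2.sqlite already contains a table that DocTable can read.
print(db.schema_table())
print(db.head())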
def main():
    # set up database objects
    #folder = '/tmp/devintest'
    tmpf = doctable.TempFolder(folder)
    target = f'{folder}/benchmark_fileobj.db'

    ddb = doctable.DocTable(schema=DataObj, target=target,
                            tabname='dataobj', new_db=True)
    ddb.delete()
    test_data = Test(ddb, DataObj)

    fdb = doctable.DocTable(schema=FileObj, target=target,
                            tabname='fileobj', new_db=True)
    fdb.delete()  # empty databases
    fdb.clean_col_files('data')
    test_file = Test(fdb, FileObj)

    print(ddb, fdb)

    params = [
        (0.00001, 100),
        (0.00001, 500),
        (0.00001, 1000),
        (0.00001, 10000),
        (0.00001, 100000),
        (0.0001, 100),
        (0.0001, 1000),
        (0.0001, 10000),
        (0.001, 100),
        (0.001, 500),
        (0.001, 1000),
        (0.01, 10),
        (0.01, 50),
        (0.01, 100),
        (0.1, 3),
        (0.1, 5),
        (0.1, 10),
        (1.0, 1),
        (1.0, 3),
        (1.0, 5),
    ]

    import itertools
    params = list(
        itertools.product([0.001, 0.005, 0.01, 0.05], [10, 50, 100, 250, 500]))
    print(f'running {len(params)} param sets')

    for sizeGB, num in params:
        print(
            f'========== Running Test (sizeGB={sizeGB}, num={num}) ==========')

        test_data.run_test(sizeGB=sizeGB, num=num)
        #pprint(test_data.last)

        test_file.run_test(sizeGB=sizeGB, num=num)
        #pprint(test_file.last)

        # clearing filesystem
        fdb.clean_col_files('data')

        # save updated version
        df = pd.DataFrame(test_data.results + test_file.results)
        df.to_csv('results/filecol_benchmark_results_hauss.csv', index=False)
        print()
        self.name6 = str(self.arc)
        self.name7 = str(self.arc)
        self.name8 = str(self.arc)
        self.name9 = str(self.arc)
        self.name10 = str(self.arc)


if __name__ == '__main__':
    timer = doctable.Timer()
    timer.step('creating dbs')

    # no idea why I can't get this to work??
    #?unix_socket="/var/run/mysqld/mysqld.sock"
    #devin:@localhost:3306/
    mdb = doctable.DocTable(schema=DataObj, target='nonprofits',
                            dialect='mysql+pymysql', new_db=True)
    #mdb.insert({'arc':5})

    folder = 'tmp_mongo'
    tmpf = doctable.TempFolder(folder)
    db = doctable.DocTable(schema=DataObj, target=f'{folder}/test.db', new_db=True)
    db.delete()

    timer.step('creating mongo db')
    myclient = pymongo.MongoClient("mongodb://localhost:27017")
    ndb = myclient["nonprofits"]
    test_col = ndb["test"]
    #test_col.delete_many({})
    print(f'mongo contents: {test_col.count_documents({})}')