def test_basic():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')

    # verify operation by pickling/dicting and undicting
    trees = list()
    for sent in nlp(ex_sents).sents:
        tree = doctable.ParseTree.from_spacy(sent)
        trees.append(tree)
        print(sent)
        print(tree)
        assert len(tree) == len(sent)

        # round-trip each tree through a pickle file
        fname = 'test_tree.pic'
        with tmp.path.joinpath(fname).open('wb') as f:
            f.write(tree.as_pickle())
        with tmp.path.joinpath(fname).open('rb') as f:
            othertree = doctable.ParseTree.from_pickle(f.read())
        assert repr(tree) == repr(othertree)

    # now work with a single tree
    tree = trees[0]
    assert tree.root.text == 'is'
    assert tree.root.tag == 'VBZ'

    # recall that ner was disabled, so accessing .ent raises
    with pytest.raises(doctable.textmodels.PropertyNotAvailable):
        tree.root.ent == ''
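
# --- Minimal sketch of the module-level setup these tests assume. ---
# These excerpts come from separate test/benchmark modules; each defines its
# own imports and fixtures. The ex_sents value below is hypothetical and
# purely illustrative, chosen so the root of the first sentence is 'is'/VBZ
# as asserted above.
import time

import spacy
import pytest

import doctable

ex_sents = 'This is the best day ever. I love programming in Python.'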
def run_benchmark(num_vals=10):
    tmp = doctable.TempFolder('tmp')
    timer = doctable.Timer('creating databases', logfile=tmp.joinpath('log.txt'))

    db1 = doctable.DocTable(schema=TestObj1, target=tmp.joinpath('1.db'), new_db=True)
    db2 = doctable.DocTable(schema=TestObj2, target=tmp.joinpath('2.db'), new_db=True)
    db2.clean_col_files('data')

    timer.step('creating synthetic data')
    data1 = [TestObj1(i) for i in range(num_vals)]
    data2 = [TestObj2(i) for i in range(num_vals)]

    timer.step('insert into table directly')
    db1.insert(data1)

    timer.step('insert into a column file')
    db2.insert(data2)

    timer.step('finished inserting')

    print('===========================================')
    print(f'===== Total took: {timer.total_diff()} =================')
    print('===========================================')
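
# --- Hypothetical sketch of the benchmark schemas compared above. ---
# TestObj1 keeps its payload in the table itself; TestObj2 differs only in
# declaring 'data' as a doctable file-backed column (which is why
# clean_col_files is called on db2). Exact column options for the
# file-backed variant are omitted; see the real benchmark script.
import doctable

@doctable.schema
class TestObj1:
    __slots__ = []
    id: int = doctable.IDCol()
    data: str = doctable.Col()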
def test_parsetreedocs():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')
    db = doctable.DocTable(schema=TestRow, target=':memory:')

    spacydocs = [nlp(t) for t in ex_sents]
    docs = [doctable.ParseTreeDoc.from_spacy(sd) for sd in spacydocs]
    db.insert([{'doc': doc} for doc in docs])

    # select the documents back
    sdocs = [r['doc'] for r in db.select()]
    assert isinstance(sdocs[0], doctable.ParseTreeDoc)

    for doc, new_doc in zip(docs, sdocs):
        print(repr(doc))
        assert repr(doc) == repr(new_doc)
        for sent, new_sent in zip(doc, new_doc):
            assert repr(sent) == repr(new_sent)

            # recall that ner was disabled
            with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                sent.root.ent == ''
            with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                new_sent.root.ent == ''
def test_parsetreedoc_dict():  # renamed to avoid shadowing test_parsetreedocs above
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')

    spacydoc = nlp(ex_sents)
    doc = doctable.ParseTreeDoc.from_spacy(spacydoc)

    # round-trip through a plain dictionary
    doc_dict = doc.as_dict()
    new_doc = doctable.ParseTreeDoc.from_dict(doc_dict)

    assert len(list(doc.tokens)) == len(list(spacydoc))
    assert len(doc) == len(list(spacydoc.sents))
    assert repr(doc) == repr(new_doc)

    for sent, new_sent in zip(doc, new_doc):
        assert repr(sent) == repr(new_sent)

        # recall that ner was disabled
        with pytest.raises(doctable.textmodels.PropertyNotAvailable):
            sent.root.ent == ''
        with pytest.raises(doctable.textmodels.PropertyNotAvailable):
            new_sent.root.ent == ''
def test_timer():
    tmp = doctable.TempFolder('logs')

    def test_func(sec=0.02):
        time.sleep(sec)

    # timer as a context manager
    with doctable.Timer('trying out enter and exit', logfile=tmp.path.joinpath('0.log')):
        test_func()

    # verbose stepping
    timer = doctable.Timer('testing verbose stepping')
    timer.step('running one thing')
    test_func()
    timer.step()
    test_func()
    timer.step('running last thing')

    # non-verbose stepping with a logfile
    timer = doctable.Timer('testing non-verbose stepping', verbose=False,
                           logfile=tmp.path.joinpath('2.log'))
    test_func()
    timer.step('whatever this step is')
    test_func()
    timer.step('next step')
    test_func()
    timer.step("that's all folks.")

    # check summary statistics over the step durations
    timer = doctable.Timer(verbose=False)
    for i in range(10):
        time.sleep(0.01)
        timer.step()

    mean = timer.get_diff_stat(stat='mean', as_str=False)
    assert 0.01 <= mean < 0.011
    med = timer.get_diff_stat(stat='median', as_str=False)
    assert 0.01 <= med < 0.011  # was checking mean twice; test the median
    stdev = timer.get_diff_stat(stat='stdev', as_str=False)
    assert 0 < stdev <= 0.001
    print(mean, med, stdev)

    print(doctable.Timer.time_call(lambda: time.sleep(0.001), num_calls=10))
    print(doctable.Timer.time_call(lambda: time.sleep(0.001), num_calls=10, as_str=True))
def test_engine_basics():
    with doctable.TempFolder('tmp') as tmp:
        eng = doctable.ConnectEngine(target='tmp/tmp_984237.db', new_db=True)
        assert len(eng.list_tables()) == 0
        print(eng)

        # both tables should now be registered with the engine
        pdb = ParentTable(engine=eng)
        cdb = pdb.get_children_table()
        print(pdb, cdb)
        assert len(eng.list_tables()) == 2

        pdb.insert(Parent(name='whateva'))
        cdb.insert(Child(name='whateva child', parent_name='whateva'))
def test_tempfolder():
    folder = 'tmp'
    p = pathlib.Path(folder)
    if os.path.exists(p):
        shutil.rmtree(p)

    # as a context manager: folder exists inside the block, removed on exit
    with doctable.TempFolder(p):
        assert os.path.exists(p)
    assert not os.path.exists(p)

    tmp = doctable.TempFolder(p, make_folder=True)
    with open(tmp.joinpath('file.txt'), 'w') as f:
        f.write('')
    assert len(tmp.rglob('*.txt')) == 1
    tmp.rmtree()
    assert not os.path.exists(tmp.path)

    # was make_folder=True, which contradicts the assertion that the
    # folder does not yet exist; create it explicitly instead
    tmp = doctable.TempFolder(p, make_folder=False)
    assert not os.path.exists(tmp.path)
    tmp.mkdir()
    assert os.path.exists(tmp.path)
    tmp.rmtree()
    assert not os.path.exists(tmp.path)
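
# --- Typical TempFolder lifecycle, as exercised by the test above. ---
# A brief usage sketch: the folder is created on enter and removed with its
# contents on exit. The 'scratch' name is illustrative only.
with doctable.TempFolder('scratch') as tmp:
    with open(tmp.joinpath('out.txt'), 'w') as f:  # folder exists here
        f.write('hello')
# 'scratch' and everything in it are gone after the block exits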
        self.name2 = str(self.id)
        self.name3 = str(self.id)
        self.name4 = str(self.id)
        self.name5 = str(self.id)
        self.name6 = str(self.id)
        self.name7 = str(self.id)
        self.name8 = str(self.id)
        self.name9 = str(self.id)
        self.name10 = str(self.id)


if __name__ == '__main__':
    # create doctable
    folder = 'tmp_dataclass_benchmark'
    tmpf = doctable.TempFolder(folder)
    db = doctable.DocTable(schema=DataObj, target=f'{folder}/test.db', new_db=True)

    # make data payload
    payload = [DataObj(i) for i in range(100000)]
    dict_payload = [o._doctable_as_dict() for o in payload]

    print('==== DataClass ====')
    print(f'\t\tDataclass Insert: {timing.time_call(lambda: db.insert(payload))}')
    print(f'\t\tDataclass Select: {timing.time_call(lambda: db.select())}')
    db.delete()
        # saving for future use
        cache_path.write_bytes(pickle.dumps(docs))
    else:
        timer.step('cache file found - now reading')
        docs = pickle.loads(cache_path.read_bytes())

    timer.step('create list of parsetrees')
    trees = [doctable.ParseTree.from_spacy(sent) for doc in docs for sent in doc.sents]
    print(trees[2])

    timer.step('creating file paths')
    tmp = doctable.TempFolder('tmp_parsetrees')
    fpaths = [tmp.path / f'{i}.pic' for i in range(len(trees))]

    timer.step('testing dictionary-based method')
    f = lambda: write_trees_pickle(trees, fpaths, use_dict=True)
    print(f'dict-based write: {timing.time_call(f)}')
    print(f'av filesize: {av_file_size(fpaths)/1000:0.2f} kB')
    f = lambda: read_trees_pickle(fpaths, use_dict=True)
    print(f'dict-based read: {timing.time_call(f)}')

    timer.step('cleaning up files')
    for fpath in fpaths:
        fpath.unlink()

    timer.step('testing raw pickle method')
    f = lambda: write_trees_pickle(trees, fpaths, use_dict=False)
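
# --- Hypothetical sketch of the helpers used in the benchmark above. ---
# write_trees_pickle/read_trees_pickle are defined earlier in the real
# script; the versions below only illustrate the two strategies being
# compared, and assume ParseTree exposes as_dict/from_dict (the dict
# round-trip is also referenced in test_basic above).
import pickle

import doctable

def write_trees_pickle(trees, fpaths, use_dict=True):
    # serialize each tree to its own file, either as a plain dict or by
    # pickling the ParseTree object directly
    for tree, fpath in zip(trees, fpaths):
        obj = tree.as_dict() if use_dict else tree
        fpath.write_bytes(pickle.dumps(obj))

def read_trees_pickle(fpaths, use_dict=True):
    if use_dict:
        return [doctable.ParseTree.from_dict(pickle.loads(p.read_bytes()))
                for p in fpaths]
    return [pickle.loads(p.read_bytes()) for p in fpaths]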
def main():
    # set up database objects
    folder = '/tmp/devintest'  # was commented out, leaving folder undefined
    tmpf = doctable.TempFolder(folder)
    target = f'{folder}/benchmark_fileobj.db'

    ddb = doctable.DocTable(schema=DataObj, target=target, tabname='dataobj', new_db=True)
    ddb.delete()  # empty databases
    test_data = Test(ddb, DataObj)

    fdb = doctable.DocTable(schema=FileObj, target=target, tabname='fileobj', new_db=True)
    fdb.delete()  # empty databases
    fdb.clean_col_files('data')
    test_file = Test(fdb, FileObj)

    print(ddb, fdb)

    params = [
        (0.00001, 100), (0.00001, 500), (0.00001, 1000), (0.00001, 10000), (0.00001, 100000),
        (0.0001, 100), (0.0001, 1000), (0.0001, 10000),
        (0.001, 100), (0.001, 500), (0.001, 1000),
        (0.01, 10), (0.01, 50), (0.01, 100),
        (0.1, 3), (0.1, 5), (0.1, 10),
        (1.0, 1), (1.0, 3), (1.0, 5),
    ]

    # the grid above is overridden by this product of sizes and counts
    import itertools
    params = list(itertools.product([0.001, 0.005, 0.01, 0.05], [10, 50, 100, 250, 500]))
    print(f'running {len(params)} param sets')

    for sizeGB, num in params:
        print(f'========== Running Test (sizeGB={sizeGB}, num={num}) ==========')
        test_data.run_test(sizeGB=sizeGB, num=num)
        test_file.run_test(sizeGB=sizeGB, num=num)

        # clearing filesystem
        fdb.clean_col_files('data')

        # save updated version
        df = pd.DataFrame(test_data.results + test_file.results)
        df.to_csv('results/filecol_benchmark_results_hauss.csv', index=False)
        print()
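
# --- Hypothetical entry point: the real benchmark script presumably ends
# --- with a standard main guard like the one in the dataclass benchmark above.
if __name__ == '__main__':
    main()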