Example #1: round-tripping a ParseTree through pickle
import doctable
import pytest
import spacy


def test_basic():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')

    # verify round-trip behavior by pickling/dicting and then restoring
    trees = list()
    # ex_sents: example text defined elsewhere in the test module
    for sent in nlp(ex_sents).sents:

        tree = doctable.ParseTree.from_spacy(sent)
        trees.append(tree)
        print(sent)
        print(tree)

        assert (len(tree) == len(sent))

        fname = 'test_tree.pic'
        with tmp.path.joinpath(fname).open('wb') as f:
            f.write(tree.as_pickle())

        with tmp.path.joinpath(fname).open('rb') as f:
            othertree = doctable.ParseTree.from_pickle(f.read())

        assert (repr(tree) == repr(othertree))

    # now work with single tree
    tree = trees[0]
    assert (tree.root.text == 'is')
    assert (tree.root.tag == 'VBZ')

    # recall that ner was disabled
    with pytest.raises(doctable.textmodels.PropertyNotAvailable):
        tree.root.ent == ''
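
The comment at the top of the loop mentions dict round-trips, but the body only exercises pickle. A minimal sketch of the dict path, assuming ParseTree mirrors the as_dict/from_dict pair that ParseTreeDoc exposes in Example #4 (an assumption, not confirmed by this excerpt):

# hedged sketch: dict round-trip for one tree (assumes ParseTree.as_dict/from_dict)
tree_dict = tree.as_dict()
othertree = doctable.ParseTree.from_dict(tree_dict)
assert repr(tree) == repr(othertree)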
Example #2: timing direct table inserts vs. column-file inserts
import doctable


def run_benchmark(num_vals=10):

    # TestObj1 and TestObj2 are schema classes defined elsewhere in this script
    tmp = doctable.TempFolder('tmp')

    timer = doctable.Timer('creating databases',
                           logfile=tmp.joinpath('log.txt'))
    db1 = doctable.DocTable(schema=TestObj1,
                            target=tmp.joinpath('1.db'),
                            new_db=True)
    db2 = doctable.DocTable(schema=TestObj2,
                            target=tmp.joinpath('2.db'),
                            new_db=True)
    db2.clean_col_files('data')

    timer.step('creating synthetic data')
    data1 = [TestObj1(i) for i in range(num_vals)]
    data2 = [TestObj2(i) for i in range(num_vals)]

    timer.step('insert into table directly')
    db1.insert(data1)

    timer.step('insert into a column file')
    db2.insert(data2)

    timer.step('finished inserting')

    print('===========================================')
    print(f'===== Total took: {timer.total_diff()} =================')
    print('===========================================')
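
For completeness, a minimal invocation sketch (assuming the schema classes above are in scope):

if __name__ == '__main__':
    run_benchmark(num_vals=1000)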
Example #3: storing ParseTreeDoc objects in a DocTable and selecting them back
import doctable
import pytest
import spacy


def test_parsetreedocs():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')
    # TestRow is defined elsewhere; see the hedged sketch after this example
    db = doctable.DocTable(schema=TestRow, target=':memory:')

    spacydocs = [nlp(t) for t in ex_sents]
    docs = [doctable.ParseTreeDoc.from_spacy(sd) for sd in spacydocs]
    db.insert([{'doc': doc} for doc in docs])

    # select the documents back
    sdocs = [r['doc'] for r in db.select()]
    assert (isinstance(sdocs[0], doctable.ParseTreeDoc))

    for doc, new_doc in zip(docs, sdocs):
        print(repr(doc))
        assert (repr(doc) == repr(new_doc))

        for sent, new_sent in zip(doc, new_doc):

            assert (repr(sent) == repr(new_sent))

            # recall that ner was disabled
            with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                sent.root.ent == ''

            # recall that ner was disabled
            with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                new_sent.root.ent == ''
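
TestRow itself is not part of this excerpt. One plausible shape, using doctable's schema decorator; this is a hypothetical reconstruction, and the real definition may use a dedicated column type for ParseTreeDoc:

import doctable

@doctable.schema
class TestRow:
    __slots__ = []
    id: int = doctable.IDCol()
    doc: doctable.ParseTreeDoc = doctable.Col()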
Example #4: dict round-trip for a whole ParseTreeDoc
import doctable
import pytest
import spacy


def test_parsetreedocs():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    tmp = doctable.TempFolder('tmp')

    spacydoc = nlp(ex_sents)
    doc = doctable.ParseTreeDoc.from_spacy(spacydoc)
    doc_dict = doc.as_dict()
    new_doc = doctable.ParseTreeDoc.from_dict(doc_dict)

    #print(doc[0])
    #print(len(doc), len(list(spacydoc.sents)))
    assert (len(list(doc.tokens)) == len(list(spacydoc)))
    assert (len(doc) == len(list(spacydoc.sents)))
    assert (repr(doc) == repr(new_doc))

    for sent, new_sent in zip(doc, new_doc):

        assert (repr(sent) == repr(new_sent))

        # recall that ner was disabled
        with pytest.raises(doctable.textmodels.PropertyNotAvailable):
            sent.root.ent == ''

        # recall that ner was disabled
        with pytest.raises(doctable.textmodels.PropertyNotAvailable):
            new_sent.root.ent == ''
Example #5: Timer as context manager, with stepping and summary statistics
import time

import doctable


def test_timer():
    tmp = doctable.TempFolder('logs')

    def test_func(sec=0.02):
        time.sleep(sec)

    #    return sum(i for i in range(n))

    with doctable.Timer('trying out enter and exit',
                        logfile=tmp.path.joinpath('0.log')):
        test_func()

    timer = doctable.Timer('testing verbose stepping')
    timer.step('running one thing')
    test_func()
    timer.step()
    test_func()
    timer.step('running last thing')

    timer = doctable.Timer('testing non-verbose stepping',
                           verbose=False,
                           logfile=tmp.path.joinpath('2.log'))
    test_func()

    timer.step('whatever this step is')
    test_func()

    timer.step('next step')
    test_func()

    timer.step("that's all folks.")

    timer = doctable.Timer(verbose=False)
    for i in range(10):
        time.sleep(0.01)
        timer.step()

    mean = timer.get_diff_stat(stat='mean', as_str=False)
    assert (mean >= 0.01 and mean < 0.011)

    med = timer.get_diff_stat(stat='median', as_str=False)
    assert (med >= 0.01 and med < 0.011)

    stdev = timer.get_diff_stat(stat='stdev', as_str=False)
    assert (stdev > 0 and stdev <= 0.001)
    print(mean, med, stdev)

    print(doctable.Timer.time_call(lambda: time.sleep(0.001), num_calls=10))
    print(
        doctable.Timer.time_call(lambda: time.sleep(0.001),
                                 num_calls=10,
                                 as_str=True))
Example #6: ConnectEngine with a parent/child table pair
import doctable


def test_engine_basics():
    with doctable.TempFolder('tmp') as tmp:

        eng = doctable.ConnectEngine(target='tmp/tmp_984237.db', new_db=True)
        assert (len(eng.list_tables()) == 0)
        print(eng)

        # ParentTable, Parent, and Child are defined elsewhere in the test module
        pdb = ParentTable(engine=eng)
        cdb = pdb.get_children_table()
        print(pdb, cdb)
        assert (len(eng.list_tables()) == 2)

        pdb.insert(Parent(name='whateva'))
        cdb.insert(Child(name='whateva child', parent_name='whateva'))
Example #7: TempFolder lifecycle (context manager, make_folder, manual mkdir/rmtree)
import os
import pathlib
import shutil

import doctable


def test_tempfolder():
    folder = 'tmp'
    p = pathlib.Path(folder)
    if os.path.exists(p):
        shutil.rmtree(p)

    with doctable.TempFolder(p):
        assert (os.path.exists(p))
    assert (not os.path.exists(p))

    tmp = doctable.TempFolder(p, make_folder=True)
    with open(tmp.joinpath('file.txt'), 'w') as f:
        f.write('')
    assert (len(tmp.rglob('*.txt')) == 1)
    tmp.rmtree()
    assert (not os.path.exists(tmp.path))

    tmp = doctable.TempFolder(p, make_folder=False)  # created manually below
    assert (not os.path.exists(tmp.path))
    tmp.mkdir()
    assert (os.path.exists(tmp.path))
    tmp.rmtree()
    assert (not os.path.exists(tmp.path))
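
Condensed, the context-manager form from the first block covers most use; a sketch using only calls exercised above:

with doctable.TempFolder('tmp') as tmp:
    with open(tmp.joinpath('note.txt'), 'w') as f:
        f.write('hello')
# the folder and its contents are removed on exit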
Example #8: benchmarking dataclass insert and select
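(This excerpt begins mid-class: the enclosing DataObj definition, including its first assignments, is cut off above; the visible lines populate string fields from the row id.)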
            self.name2 = str(self.id)
            self.name3 = str(self.id)
            self.name4 = str(self.id)
            self.name5 = str(self.id)
            self.name6 = str(self.id)
            self.name7 = str(self.id)
            self.name8 = str(self.id)
            self.name9 = str(self.id)
            self.name10 = str(self.id)


if __name__ == '__main__':

    # create the doctable (timing, used below, is a local helper module)
    folder = 'tmp_dataclass_benchmark'
    tmpf = doctable.TempFolder(folder)
    db = doctable.DocTable(schema=DataObj,
                           target=f'{folder}/test.db',
                           new_db=True)

    # make data payload
    payload = [DataObj(i) for i in range(100000)]
    dict_payload = [o._doctable_as_dict() for o in payload]

    print('==== DataClass ====')
    print(
        f'\t\tDataclass Insert: {timing.time_call(lambda: db.insert(payload))}'
    )
    print(f'\t\tDataclass Select: {timing.time_call(lambda: db.select())}')
    db.delete()
Example #9: benchmarking ParseTree serialization, dict-based vs. raw pickle
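(This excerpt begins mid-function, inside a cache-or-parse branch; a hypothetical opening, not in the original, might look like:)

# cache_path = pathlib.Path('cache.pic')
# if not cache_path.exists():
#     timer.step('parsing documents with spacy')
#     docs = [nlp(t) for t in texts]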
        # saving for future use
        cache_path.write_bytes(pickle.dumps(docs))
    else:
        timer.step('cache file found - now reading')
        docs = pickle.loads(cache_path.read_bytes())

    timer.step('create list of parsetrees')
    trees = [
        doctable.ParseTree.from_spacy(sent) for doc in docs
        for sent in doc.sents
    ]
    print(trees[2])

    timer.step('creating file paths')
    tmp = doctable.TempFolder('tmp_parsetrees')
    fpaths = [tmp.path / f'{i}.pic' for i in range(len(trees))]

    timer.step('testing dictionary-based method')
    f = lambda: write_trees_pickle(trees, fpaths, use_dict=True)
    print(f'dict-based write: {timing.time_call(f)}')
    print(f'av filesize: {av_file_size(fpaths)/1000:0.2f} kB')
    f = lambda: read_trees_pickle(fpaths, use_dict=True)
    print(f'dict-based read: {timing.time_call(f)}')

    timer.step('cleaning up files')
    for fpath in fpaths:
        fpath.unlink()

    timer.step('testing raw pickle method')
    f = lambda: write_trees_pickle(trees, fpaths, use_dict=False)
Example #10: file-column vs. in-table storage benchmark across payload sizes
import itertools

import doctable
import pandas as pd


def main():

    # set up database objects; Test, DataObj, and FileObj are defined elsewhere
    folder = '/tmp/devintest'

    tmpf = doctable.TempFolder(folder)

    target = f'{folder}/benchmark_fileobj.db'
    ddb = doctable.DocTable(schema=DataObj,
                            target=target,
                            tabname='dataobj',
                            new_db=True)
    ddb.delete()
    test_data = Test(ddb, DataObj)

    fdb = doctable.DocTable(schema=FileObj,
                            target=target,
                            tabname='fileobj',
                            new_db=True)
    fdb.delete()  # empty databases
    fdb.clean_col_files('data')
    test_file = Test(fdb, FileObj)
    print(ddb, fdb)

    params = [
        (0.00001, 100),
        (0.00001, 500),
        (0.00001, 1000),
        (0.00001, 10000),
        (0.00001, 100000),
        (0.0001, 100),
        (0.0001, 1000),
        (0.0001, 10000),
        (0.001, 100),
        (0.001, 500),
        (0.001, 1000),
        (0.01, 10),
        (0.01, 50),
        (0.01, 100),
        (0.1, 3),
        (0.1, 5),
        (0.1, 10),
        (1.0, 1),
        (1.0, 3),
        (1.0, 5),
    ]

    # note: this grid overrides the hand-picked params list above
    params = list(
        itertools.product([0.001, 0.005, 0.01, 0.05], [10, 50, 100, 250, 500]))
    print(f'running {len(params)} param sets')

    for sizeGB, num in params:
        print(
            f'========== Running Test (sizeGB={sizeGB}, num={num}) ==========')

        test_data.run_test(sizeGB=sizeGB, num=num)
        #pprint(test_data.last)

        test_file.run_test(sizeGB=sizeGB, num=num)
        #pprint(test_file.last)

        # clearing filesystem
        fdb.clean_col_files('data')

        # save updated version
        df = pd.DataFrame(test_data.results + test_file.results)
        df.to_csv('results/filecol_benchmark_results_hauss.csv', index=False)

        print()