예제 #1
0
def run_benchmark(num_vals=10):
    """Time inserts into two DocTables and print the total elapsed time.

    One table is built from ``TestObj1`` and one from ``TestObj2``; both
    database files live inside a ``doctable.TempFolder`` named ``tmp``.
    Each phase is marked with a ``doctable.Timer`` step, and the timer is
    given ``log.txt`` in that folder as its ``logfile``.

    Args:
        num_vals: Number of synthetic objects created for, and inserted
            into, each table.
    """
    workdir = doctable.TempFolder('tmp')

    # The timer is created with the label of the first phase.
    clock = doctable.Timer(
        'creating databases',
        logfile=workdir.joinpath('log.txt'),
    )
    direct_db = doctable.DocTable(
        schema=TestObj1,
        target=workdir.joinpath('1.db'),
        new_db=True,
    )
    filecol_db = doctable.DocTable(
        schema=TestObj2,
        target=workdir.joinpath('2.db'),
        new_db=True,
    )
    # NOTE(review): presumably removes leftover files backing the 'data'
    # column -- confirm against the doctable API.
    filecol_db.clean_col_files('data')

    clock.step('creating synthetic data')
    direct_rows = list(map(TestObj1, range(num_vals)))
    filecol_rows = list(map(TestObj2, range(num_vals)))

    clock.step('insert into table directly')
    direct_db.insert(direct_rows)

    clock.step('insert into a column file')
    filecol_db.insert(filecol_rows)

    clock.step('finished inserting')

    # Summary banner: a rule above and below the total time.
    rule = '==========================================='
    print(rule)
    print(f'===== Total took: {clock.total_diff()} =================')
    print(rule)
예제 #2
0
def test_parsetreedocs():
    """Round-trip ``ParseTreeDoc`` objects through an in-memory DocTable.

    Parses ``ex_sents`` with a spaCy pipeline that has ``ner`` disabled,
    converts the results to ``doctable.ParseTreeDoc``, inserts them, selects
    them back, and checks that the reprs of documents and sentences match.
    It also checks that reading ``root.ent`` raises
    ``PropertyNotAvailable`` on both the original and the selected copies.
    """
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    # Bound to a name although it is not used below; kept as in the original.
    tmp = doctable.TempFolder('tmp')
    db = doctable.DocTable(schema=TestRow, target=':memory:')

    originals = []
    for text in ex_sents:
        originals.append(doctable.ParseTreeDoc.from_spacy(nlp(text)))
    db.insert([{'doc': tree} for tree in originals])

    # Read the stored documents back out of the table.
    restored = [row['doc'] for row in db.select()]
    assert isinstance(restored[0], doctable.ParseTreeDoc)

    for before, after in zip(originals, restored):
        print(repr(before))
        assert repr(before) == repr(after)

        for sent_before, sent_after in zip(before, after):
            assert repr(sent_before) == repr(sent_after)

            # ner was disabled in the pipeline, so the entity property must
            # be unavailable on the original and on the selected sentence.
            for parsed in (sent_before, sent_after):
                with pytest.raises(doctable.textmodels.PropertyNotAvailable):
                    parsed.root.ent == ''
예제 #3
0
def make_db():
    """Return a new in-memory DocTable with ``id``, ``title`` and ``age`` columns.

    The schema is given in tuple form: ``id`` is an integer primary key,
    ``title`` is a unique string, and ``age`` is a float.
    """
    id_col = ('integer', 'id', {'primary_key': True})
    title_col = ('string', 'title', {'unique': True})
    age_col = ('float', 'age')
    return doctable.DocTable(
        target=':memory:',
        schema=(id_col, title_col, age_col),
    )
예제 #4
0
def test_insert_single1(n=20):
    """Insert ``n`` generated rows one call at a time and verify the table.

    Args:
        n: Number of rows requested from ``gen_data1``.
    """
    table = None
    expected = gen_data1(n)
    table = doctable.DocTable(target=':memory:', schema=schema1)

    # One insert call per row, in generation order.
    for record in expected:
        table.insert(record)

    matches = check_db(expected, table, show=False)
    assert matches
예제 #5
0
def test_init_errors():
    """Check the errors raised for invalid DocTable / ConnectEngine setups."""
    # Per the original author's note, this database is never actually
    # created; the name is only passed to calls that are expected to fail.
    missing_db = 'randomdbname2349io8oipaudjrtfoajsd.db'

    # A new in-memory database without a schema is rejected ...
    with pytest.raises(ValueError):
        doctable.DocTable(target=':memory:')
    # ... while the same target with a schema can be built and printed.
    in_memory = doctable.DocTable(target=':memory:', schema=schema)
    print(in_memory)

    # A new on-disk database without a schema is rejected as well.
    with pytest.raises(ValueError):
        doctable.DocTable(target=missing_db, new_db=True)
    #print(doctable.DocTable(target=randdbname, schema=schema))

    # Connecting with new_db=False to a file that does not exist must fail.
    with pytest.raises(FileNotFoundError):
        doctable.ConnectEngine(target=missing_db, new_db=False)
예제 #6
0
def new_db():
    """Return a new in-memory DocTable with ``id``, ``title`` and ``year`` columns.

    ``id`` is an integer primary key, ``title`` is a unique string, and
    ``year`` is an integer.
    """
    columns = (
        ('integer', 'id', {'primary_key': True}),
        ('string', 'title', {'unique': True}),
        ('integer', 'year'),
    )
    return doctable.DocTable(target=':memory:', schema=columns)
예제 #7
0
            self.name4 = str(self.id)
            self.name5 = str(self.id)
            self.name6 = str(self.id)
            self.name7 = str(self.id)
            self.name8 = str(self.id)
            self.name9 = str(self.id)
            self.name10 = str(self.id)


# Script entry point: time inserting and selecting DataObj rows.
if __name__ == '__main__':

    # Create a DocTable with the DataObj schema, stored as test.db inside a
    # folder that is handed to doctable.TempFolder.
    folder = 'tmp_dataclass_benchmark'
    tmpf = doctable.TempFolder(folder)
    db = doctable.DocTable(schema=DataObj,
                           target=f'{folder}/test.db',
                           new_db=True)

    # Build the payload twice: as DataObj instances, and as the dicts those
    # instances produce through _doctable_as_dict().
    payload = [DataObj(i) for i in range(100000)]
    dict_payload = [o._doctable_as_dict() for o in payload]

    # Time the object-based insert and a select through timing.time_call.
    # NOTE(review): presumably time_call returns the elapsed time of the
    # callable -- confirm against the timing helper.
    print(f'==== DataClass ====')
    print(
        f'\t\tDataclass Insert: {timing.time_call(lambda: db.insert(payload))}'
    )
    print(f'\t\tDataclass Select: {timing.time_call(lambda: db.select())}')
    # NOTE(review): presumably empties the table before the next benchmark
    # section -- confirm against the DocTable.delete documentation.
    db.delete()

    print(f'==== Dict ====')
    print(
예제 #8
0
def test_insert_many1(n=20):
    """Insert ``n`` generated rows in a single call and verify the table.

    Args:
        n: Number of rows requested from ``gen_data1``.
    """
    expected = gen_data1(n)
    table = doctable.DocTable(target=':memory:', schema=schema1)

    # The whole list goes in with one insert call.
    table.insert(expected)

    matches = check_db(expected, table, show=False)
    assert matches
예제 #9
0
# Demo script: inspect a MyClass instance and round-trip it through a table.
if __name__ == '__main__':
    mc = MyClass()

    # Print each column that doctable derives from the dataclass.
    parsed_columns = doctable.parse_schema_dataclass(MyClass)
    for column in parsed_columns:
        print(column)

    #for col in mc.sqlalchemy_columns():
    #    print(col)

    # Show the object and one attribute, then try subscript access; any
    # error from the subscript is printed rather than raised.
    print(mc)
    print(mc.lon)
    try:
        print(mc['lon'])
    except Exception as err:
        print(err)

    # Store two rows in an in-memory table built from the same dataclass.
    db = doctable.DocTable(target=':memory:', schema=MyClass)
    tokens = 'l o l ha ha ha'.split()
    db.insert(MyClass('hahhahha', elements=tokens))
    db.insert(MyClass('whatever'))

    print(db.head())
    print(db.schema_table())
    print(isinstance(mc, doctable.DocTableRow))

    # Select only two columns and print them per row.
    selected = db.select(['idx', 'name'])
    for record in selected:
        print(f"{record.idx}: {record.name}")
    #print(fields(MyBaseClass))
    #mt = MyTable()
    #print(fields(mt.dclass))
    #print(fields(mc))
    #print(mc['name'])
    #print(mc)
    #print(mc.name)
    #print(fields(MyClass))
예제 #10
0


import doctable

# Open a DocTable on the metadata database file and print the table object.
db_file = 'metadata2.sqlite'
db = doctable.DocTable(fname=db_file)
print(db)
예제 #11
0
def main():
    """Benchmark a ``DataObj`` table against a ``FileObj`` table.

    Two DocTables (``dataobj`` and ``fileobj``) are created in the same
    database file under ``folder``. For every ``(sizeGB, num)`` pair in the
    parameter grid, ``Test.run_test`` is run on each table, the ``data``
    column files are cleaned, and the accumulated results of both tests are
    written to ``results/filecol_benchmark_results_hauss.csv``. The CSV is
    rewritten after every parameter set so that partial results survive an
    interrupted run.

    NOTE: ``folder`` is not assigned inside this function; it must be
    defined at module level before ``main`` is called.
    """
    import itertools
    import os

    # set up database objects
    tmpf = doctable.TempFolder(folder)

    target = f'{folder}/benchmark_fileobj.db'
    ddb = doctable.DocTable(schema=DataObj,
                            target=target,
                            tabname='dataobj',
                            new_db=True)
    ddb.delete()
    test_data = Test(ddb, DataObj)

    fdb = doctable.DocTable(schema=FileObj,
                            target=target,
                            tabname='fileobj',
                            new_db=True)
    fdb.delete()  # empty databases
    fdb.clean_col_files('data')
    test_file = Test(fdb, FileObj)
    print(ddb, fdb)

    # Create the output directory before any benchmark runs: to_csv does not
    # create missing parent directories, and failing only after the first
    # (possibly long) test would throw away its results.
    results_path = 'results/filecol_benchmark_results_hauss.csv'
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Parameter grid: every combination of payload size (GB) and row count.
    params = list(
        itertools.product([0.001, 0.005, 0.01, 0.05], [10, 50, 100, 250, 500]))
    print(f'running {len(params)} param sets')

    for sizeGB, num in params:
        print(
            f'========== Running Test (sizeGB={sizeGB}, num={num}) ==========')

        test_data.run_test(sizeGB=sizeGB, num=num)
        #pprint(test_data.last)

        test_file.run_test(sizeGB=sizeGB, num=num)
        #pprint(test_file.last)

        # clearing filesystem
        fdb.clean_col_files('data')

        # save updated version
        df = pd.DataFrame(test_data.results + test_file.results)
        df.to_csv(results_path, index=False)

        print()
예제 #12
0
            self.name6 = str(self.arc)
            self.name7 = str(self.arc)
            self.name8 = str(self.arc)
            self.name9 = str(self.arc)
            self.name10 = str(self.arc)


# Script entry point: set up a MySQL table, a file-backed table and a MongoDB
# collection, timing each setup phase.
if __name__ == '__main__':
    timer = doctable.Timer()

    timer.step('creating dbs')

    # Original author's note: the MySQL connection could not be made to work.
    # Connection fragments that were tried:
    #?unix_socket="/var/run/mysqld/mysqld.sock"
    #devin:@localhost:3306/
    mdb = doctable.DocTable(
        schema=DataObj,
        target='nonprofits',
        dialect='mysql+pymysql',
        new_db=True,
    )
    #mdb.insert({'arc':5})

    # File-backed table inside a TempFolder-managed directory.
    folder = 'tmp_mongo'
    tmpf = doctable.TempFolder(folder)
    db = doctable.DocTable(
        schema=DataObj,
        target=f'{folder}/test.db',
        new_db=True,
    )
    db.delete()

    timer.step('creating mongo db')
    myclient = pymongo.MongoClient("mongodb://localhost:27017")
    ndb = myclient["nonprofits"]
    test_col = ndb["test"]
    #test_col.delete_many({})

    # Report how many documents the collection currently holds.
    doc_count = test_col.count_documents({})
    print(f'mongo contents: {doc_count}')