def test_nullify():
    """Check bayesdb_nullify's return value and its effect on table cells.

    Three calls are made against a five-row, two-column table: one over
    every column with the empty string, one restricted to column ``x``
    with ``'nan'``, and one with a value that occurs nowhere.
    """
    initial_rows = [
        ['1', ''],
        ['nan', 'foo'],
        ['2', 'nan'],
        ['2', '""'],
        ['', ''],
    ]
    with bayesdb_open(':memory:') as bdb:

        def snapshot():
            # Current contents of t, as a list of tuples.
            return bdb.execute('select * from t').fetchall()

        bdb.sql_execute('create table t(x,y)')
        for values in initial_rows:
            bdb.sql_execute('insert into t values(?,?)', values)
        # Before any nullification the table mirrors what was inserted.
        assert snapshot() == [tuple(values) for values in initial_rows]

        # Empty strings in any column become NULL: three cells in total.
        changed = bayesdb_nullify(bdb, 't', '')
        assert changed == 3
        assert snapshot() == [
            ('1', None),
            ('nan', 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            (None, None),
        ]

        # Restricted to column x, only the single 'nan' there is replaced;
        # the 'nan' in column y is left alone.
        changed = bayesdb_nullify(bdb, 't', 'nan', columns=['x'])
        assert changed == 1
        assert snapshot() == [
            ('1', None),
            (None, 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            (None, None),
        ]

        # A value that occurs nowhere changes nothing.
        changed = bayesdb_nullify(bdb, 't', 'fnord')
        assert changed == 0
def loom_analyze(csv_filename):
    """Run a Loom analysis smoke test on a table loaded from a CSV file.

    The calling test is skipped when the ``loom`` package cannot be
    imported.  Otherwise an in-memory database is opened, table ``t`` is
    created from ``csv_filename``, ``'NaN'`` cells are nullified, and a
    population and generator are built and analyzed with Loom.  The
    function then checks that unsupported Loom analysis options raise,
    runs a BQL dependence-probability query, and finally runs the
    ``optimized`` analysis on the same generator.

    Parameters:
        csv_filename: path interpolated into ``CREATE TABLE t FROM '...'``.
    """
    try:
        import loom  # imported only to probe availability
    except ImportError:
        # pytest.skip raises, so nothing after this call is executed.
        pytest.skip('no loom')
    # Use the handle managed by the ``with`` statement so that it is
    # closed on exit; opening a second database here would leak it.
    with bayesdb_open(':memory:') as bdb:
        # NOTE(review): csv_filename is interpolated unescaped; a path
        # containing a single quote would break this statement.
        bdb.execute('CREATE TABLE t FROM \'%s\'' % (csv_filename))
        bayesdb_nullify(bdb, 't', 'NaN')
        bdb.execute(''' CREATE POPULATION p FOR t WITH SCHEMA( GUESS STATTYPES OF (*); ) ''')
        bdb.execute('CREATE GENERATOR m FOR p;')
        bdb.execute('INITIALIZE 10 MODELS FOR m')
        bdb.execute('ANALYZE m FOR 2 ITERATIONS (loom);')
        # targeted analysis for Loom not supported.
        with pytest.raises(BQLError):
            bdb.execute(''' ANALYZE m FOR 1 ITERATION (loom; variables TTL_MDCR_SPND); ''')
        # progress for Loom not supported (error from cgpm).
        with pytest.raises(ValueError):
            bdb.execute(''' ANALYZE m FOR 1 ITERATION (loom; quiet); ''')
        # timing for Loom not supported (error from cgpm).
        with pytest.raises(ValueError):
            bdb.execute(''' ANALYZE m FOR 1 SECONDS (loom); ''')
        # Run a BQL query.
        bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE VARIABLES OF p; ''')
        # Make sure we can run lovecat afterwards.
        bdb.execute('ANALYZE m FOR 2 ITERATION (optimized);')
def test_cgpm_no_empty_categories():
    """Check that nullified cells do not show up as cgpm categories.

    Several spellings of "empty" are inserted into a three-column table
    and nullified before the columns are modeled as NOMINAL.  The rows of
    ``bayesdb_cgpm_category`` must then match the expected per-column
    category sets.
    """
    # NOTE(review): a second function with this same name is defined
    # later in this source; if both live in one module the later one
    # shadows this one -- confirm which is intended to run.
    with cgpm_smoke_bdb() as bdb:
        bdb.sql_execute('CREATE TABLE f (a, b, c);')
        for record in (
            ['', '\'\'', 'nan'],
            [1.1, 3, ''],
            ['""""', 1, 1],
        ):
            bdb.sql_execute('INSERT INTO f (a, b, c) VALUES (?,?,?)', record)
        # Turn each placeholder spelling into NULL, in the original order.
        for placeholder in ("''", '""""', ''):
            bayesdb_nullify(bdb, 'f', placeholder)
        bdb.execute(''' CREATE POPULATION q FOR f WITH SCHEMA ( MODEL a, b, c AS NOMINAL ); ''')
        bdb.execute('CREATE METAMODEL h IF NOT EXISTS FOR q USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR h')
        cursor = bdb.sql_execute(''' SELECT colno, value FROM bayesdb_cgpm_category; ''')
        # Assert that none of the categories are empty strings or NULL.
        expected = {
            0: ['1.1'],         # categories for a
            1: ['1', '3'],      # categories for b
            2: ['nan', '1'],    # categories for c
        }
        seen = {colno: [] for colno in expected}
        for colno, value in cursor:
            seen[colno].append(value)
        # Order within a column is irrelevant, so compare as sets.
        assert all(set(expected[c]) == set(seen[c]) for c in expected)
def test_cgpm_no_empty_categories():
    """Check that nullified cells never become cgpm categories.

    A three-column table is filled with values that include several
    spellings of "empty"; those spellings are nullified, the columns are
    given the NOMINAL stattype, and one model is initialized.  The
    contents of ``bayesdb_cgpm_category`` are then compared, per column,
    with the expected category sets.
    """
    table_rows = [
        ['', '\'\'', 'nan'],
        [1.1, 3, ''],
        ['""""', 1, 1],
    ]
    with cgpm_smoke_bdb() as bdb:
        bdb.sql_execute('CREATE TABLE f (a, b, c);')
        for cells in table_rows:
            bdb.sql_execute('INSERT INTO f (a, b, c) VALUES (?,?,?)', cells)
        bayesdb_nullify(bdb, 'f', "''")
        bayesdb_nullify(bdb, 'f', '""""')
        bayesdb_nullify(bdb, 'f', '')
        bdb.execute(''' CREATE POPULATION q FOR f WITH SCHEMA ( SET STATTYPES OF a, b, c TO NOMINAL ); ''')
        bdb.execute('CREATE GENERATOR IF NOT EXISTS h FOR q USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR h')
        category_cursor = bdb.sql_execute(''' SELECT colno, value FROM bayesdb_cgpm_category; ''')
        # Assert that none of the categories are empty strings or NULL.
        expected = {
            0: ['1.1'],         # categories for a
            1: ['1', '3'],      # categories for b
            2: ['nan', '1'],    # categories for c
        }
        seen = {0: [], 1: [], 2: []}
        for colno, value in category_cursor:
            seen[colno].append(value)
        # Compare as sets: only membership matters, not retrieval order.
        matches = [set(expected[c]) == set(seen[c]) for c in expected]
        assert all(matches)