Пример #1
0
def test_nullify():
    """Check that bayesdb_nullify replaces matching cell values with NULL.

    A five-row, two-column table of text values is created, then nullified
    three times.  Each call's return value is compared with the number of
    cells expected to change, and the table is re-read after every call
    that is expected to modify it.
    """
    select_all = 'select * from t'
    original_rows = [
        ('1', ''),
        ('nan', 'foo'),
        ('2', 'nan'),
        ('2', '""'),
        ('', ''),
    ]
    with bayesdb_open(':memory:') as bdb:
        bdb.sql_execute('create table t(x,y)')
        for x, y in original_rows:
            bdb.sql_execute('insert into t values(?,?)', [x, y])
        # Sanity check: the data round-trips unchanged before nullifying.
        assert bdb.execute(select_all).fetchall() == original_rows

        # Empty strings in either column become NULL; the two-character
        # string '""' is a different value and must survive.
        changed = bayesdb_nullify(bdb, 't', '')
        assert changed == 3
        after_empty = [
            ('1', None),
            ('nan', 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            (None, None),
        ]
        assert bdb.execute(select_all).fetchall() == after_empty

        # Restricting to column x leaves the 'nan' in column y alone.
        changed = bayesdb_nullify(bdb, 't', 'nan', columns=['x'])
        assert changed == 1
        after_nan = [
            ('1', None),
            (None, 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            (None, None),
        ]
        assert bdb.execute(select_all).fetchall() == after_nan

        # A value that appears nowhere changes nothing.
        changed = bayesdb_nullify(bdb, 't', 'fnord')
        assert changed == 0
Пример #2
0
def loom_analyze(csv_filename):
    """Exercise Loom-backed analysis on a table loaded from a CSV file.

    Skips the calling test when the optional ``loom`` package cannot be
    imported.  Otherwise loads ``csv_filename`` into an in-memory BayesDB,
    nullifies 'NaN' cells, builds a population and generator, and checks
    which ANALYZE variants succeed and which are expected to raise.

    :param csv_filename: path of the CSV file to load into table ``t``.
    """
    try:
        import loom  # noqa: F401 -- imported only to probe availability.
    except ImportError:
        # pytest.skip raises, so nothing after this call runs.
        pytest.skip('no loom')
    # Use the handle managed by the context manager for everything, so the
    # database that is actually exercised is the one closed on exit.
    with bayesdb_open(':memory:') as bdb:
        bdb.execute('CREATE TABLE t FROM \'%s\'' % (csv_filename))
        bayesdb_nullify(bdb, 't', 'NaN')
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                GUESS STATTYPES OF (*);
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR p;')
        bdb.execute('INITIALIZE 10 MODELS FOR m')
        bdb.execute('ANALYZE m FOR 2 ITERATIONS (loom);')

        # Targeted analysis for Loom is not supported.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE m FOR 1 ITERATION (loom; variables TTL_MDCR_SPND);
            ''')
        # Progress control for Loom is not supported (error from cgpm).
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE m FOR 1 ITERATION (loom; quiet);
            ''')
        # Timed analysis for Loom is not supported (error from cgpm).
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE m FOR 1 SECONDS (loom);
            ''')
        # Run a BQL query.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE VARIABLES OF p;
        ''')
        # Make sure we can run lovecat afterwards.
        bdb.execute('ANALYZE m FOR 2 ITERATION (optimized);')
Пример #3
0
def test_cgpm_no_empty_categories():
    with cgpm_smoke_bdb() as bdb:
        bdb.sql_execute('CREATE TABLE f (a, b, c);')
        rows = [['', '\'\'', 'nan'], [1.1, 3, ''], ['""""', 1, 1]]
        for row in rows:
            bdb.sql_execute('INSERT INTO f (a, b, c) VALUES (?,?,?)', row)
        bayesdb_nullify(bdb, 'f', "''")
        bayesdb_nullify(bdb, 'f', '""""')
        bayesdb_nullify(bdb, 'f', '')
        bdb.execute('''
            CREATE POPULATION q FOR f WITH SCHEMA (
                MODEL a, b, c AS NOMINAL
            );
        ''')
        bdb.execute('CREATE METAMODEL h IF NOT EXISTS FOR q USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR h')
        category_rows = bdb.sql_execute('''
            SELECT colno, value FROM bayesdb_cgpm_category;
        ''')
        # Assert that none of the categories are empty strings or NULL.
        expected = {
            0: ['1.1'],  # categories for a
            1: ['1', '3'],  # categories for b
            2: ['nan', '1'],  # categories for c
        }
        seen = {
            0: [],
            1: [],
            2: [],
        }
        for row in category_rows:
            colno, value = row
            seen[colno].append(value)
        assert all(set(expected[c]) == set(seen[c]) for c in expected)
Пример #4
0
def test_cgpm_no_empty_categories():
    with cgpm_smoke_bdb() as bdb:
        bdb.sql_execute('CREATE TABLE f (a, b, c);')
        rows = [['', '\'\'', 'nan'], [1.1, 3, ''], ['""""', 1, 1]]
        for row in rows:
            bdb.sql_execute('INSERT INTO f (a, b, c) VALUES (?,?,?)', row)
        bayesdb_nullify(bdb, 'f', "''")
        bayesdb_nullify(bdb, 'f', '""""')
        bayesdb_nullify(bdb, 'f', '')
        bdb.execute('''
            CREATE POPULATION q FOR f WITH SCHEMA (
                SET STATTYPES OF a, b, c TO NOMINAL
            );
        ''')
        bdb.execute('CREATE GENERATOR IF NOT EXISTS h FOR q USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR h')
        category_rows = bdb.sql_execute('''
            SELECT colno, value FROM bayesdb_cgpm_category;
        ''')
        # Assert that none of the categories are empty strings or NULL.
        expected = {
            0 : ['1.1'],       # categories for a
            1 : ['1', '3'],    # categories for b
            2 : ['nan', '1'],  # categories for c
        }
        seen = {
            0: [],
            1: [],
            2: [],
        }
        for row in category_rows:
            colno, value = row
            seen[colno].append(value)
        assert all(set(expected[c])==set(seen[c]) for c in expected)