Exemplo n.º 1
0
def test_nig_normal_latent_numbering():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('''
            create population p for t(id ignore; model x,y as numerical)
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g0) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g1) == [-1, 1, 2]
Exemplo n.º 2
0
def test_subsample():
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
            (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
Exemplo n.º 3
0
def test_nig_normal_latent_numbering():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)', (x, x*x - 100))
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
Exemplo n.º 4
0
def test_subsample():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb,
                                 'hospitals_full',
                                 'dha',
                                 overrides=[('name', 'key')])
        bayesdb_guess_population(bdb,
                                 'hospitals_sub',
                                 'dha',
                                 overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING crosscat (
                SUBSAMPLE(OFF)
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING crosscat (
                SUBSAMPLE(100)
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION WAIT')
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
                    ' FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE hospitals_sub'
                    ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
                    ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full, ))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid, ))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
Exemplo n.º 5
0
def test_subsample():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR dhacc_full FOR dha USING crosscat (
                SUBSAMPLE(OFF),
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR dhacc FOR dha USING crosscat (
                SUBSAMPLE(100),
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR dhacc')
        bdb.execute('ANALYZE dhacc FOR 1 ITERATION WAIT')
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
            ' FROM dhacc WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE dhacc'
            ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
                ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        gid_full = bayesdb_get_generator(bdb, 'dhacc_full')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, 'dhacc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR dhacc')
        bdb.execute('DROP GENERATOR dhacc_full')
Exemplo n.º 6
0
def bayesdb_population(mkbdb,
                       tab,
                       pop,
                       gen,
                       table_schema,
                       data,
                       columns,
                       backend_name='cgpm'):
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        qt = bql_quote_name(tab)
        qp = bql_quote_name(pop)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(backend_name)
        bdb.execute('CREATE POPULATION %s FOR %s(%s)' %
                    (qp, qt, ';'.join(columns)))
        bdb.execute('CREATE GENERATOR %s FOR %s USING %s;' % (
            qg,
            qp,
            qmm,
        ))
        population_id = core.bayesdb_get_population(bdb, pop)
        generator_id = core.bayesdb_get_generator(bdb, population_id, gen)
        yield bdb, population_id, generator_id
Exemplo n.º 7
0
def test_bayesdb_implicit_population_generator_rename():
    with bayesdb() as bdb:
        # Create table t.
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        # Create an implicit population.
        bdb.execute('create population for t (guess (*))')
        population_id = core.bayesdb_get_population(bdb, 't')
        # Create an implicit generator.
        bdb.execute('create generator for t')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 't')
        # Fail to rename to implicit generator directly.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('alter generator t rename to gaz')
        # Fail to rename to implicit population directly.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('alter population t rename to paz')
        # Rename base table t.
        bdb.execute('alter table t rename to t2')
        # Check population name propagated.
        population_id2 = core.bayesdb_get_population(bdb, 't2')
        assert not core.bayesdb_has_population(bdb, 't')
        assert core.bayesdb_has_population(bdb, 't2')
        assert population_id2 == population_id
        # Check the generator name propagated.
        generator_id2 = core.bayesdb_get_population(bdb, 't2')
        assert not core.bayesdb_has_generator(bdb, population_id, 't')
        assert core.bayesdb_has_generator(bdb, population_id2, 't2')
        assert generator_id2 == generator_id
Exemplo n.º 8
0
def test_bayesdb_implicit_population_generator_rename():
    with bayesdb() as bdb:
        # Create table t.
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        # Create an implicit population.
        bdb.execute('create population for t (guess (*))')
        population_id = core.bayesdb_get_population(bdb, 't')
        # Create an implicit generator.
        bdb.execute('create generator for t')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 't')
        # Fail to rename to implicit generator directly.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('alter generator t rename to gaz')
        # Fail to rename to implicit population directly.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('alter population t rename to paz')
        # Rename base table t.
        bdb.execute('alter table t rename to t2')
        # Check population name propagated.
        population_id2 = core.bayesdb_get_population(bdb, 't2')
        assert not core.bayesdb_has_population(bdb, 't')
        assert core.bayesdb_has_population(bdb, 't2')
        assert population_id2 == population_id
        # Check the generator name propagated.
        generator_id2 = core.bayesdb_get_population(bdb, 't2')
        assert not core.bayesdb_has_generator(bdb, population_id, 't')
        assert core.bayesdb_has_generator(bdb, population_id2, 't2')
        assert generator_id2 == generator_id
Exemplo n.º 9
0
def num_models_in_bdb(bdb, generator):
    generator_id = bayesdb_get_generator(bdb, generator)
    q = '''SELECT COUNT(*), MAX(modelno)
        FROM bayesdb_generator_model WHERE generator_id = ?'''
    (ct, maximum) = bdb.execute(q, (generator_id,)).next()
    assert ct == maximum+1, "Expected models to be numbered sequentially"
    return ct
Exemplo n.º 10
0
def num_models_in_bdb(bdb, generator):
    generator_id = bayesdb_get_generator(bdb, generator)
    q = '''SELECT COUNT(*), MAX(modelno)
        FROM bayesdb_generator_model WHERE generator_id = ?'''
    (ct, maximum) = bdb.execute(q, (generator_id, )).next()
    assert ct == maximum + 1, "Expected models to be numbered sequentially"
    return ct
Exemplo n.º 11
0
def test_crosscat_constraints():
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        def predictive_probability_multistate(self, M_c, X_L_list, X_D_list, Y,
                                              Q):
            self._last_Y = Y
            sup = super(FakeEngine, self)
            return sup.simple_predictive_probability_multistate(
                M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)

        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            return super(FakeEngine, self).simple_predictive_sample(seed=seed,
                                                                    M_c=M_c,
                                                                    X_L=X_L,
                                                                    X_D=X_D,
                                                                    Y=Y,
                                                                    Q=Q,
                                                                    n=n)

        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            return super(FakeEngine, self).impute_and_confidence(seed=seed,
                                                                 M_c=M_c,
                                                                 X_L=X_L,
                                                                 X_D=X_D,
                                                                 Y=Y,
                                                                 Q=Q,
                                                                 n=n)

    engine = FakeEngine(seed=0)
    mm = CrosscatMetamodel(engine)
    with bayesdb(metamodel=mm) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        bdb.execute('''
            CREATE GENERATOR t1_cc FOR t1 USING crosscat(
                label CATEGORICAL,
                age NUMERICAL,
                weight NUMERICAL
            )
        ''')
        gid = core.bayesdb_get_generator(bdb, 't1_cc')
        assert core.bayesdb_generator_column_number(bdb, gid, 'label') == 1
        assert core.bayesdb_generator_column_number(bdb, gid, 'age') == 2
        assert core.bayesdb_generator_column_number(bdb, gid, 'weight') == 3
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        assert crosscat_cc_colno(bdb, gid, 1) == 0
        assert crosscat_cc_colno(bdb, gid, 2) == 1
        assert crosscat_cc_colno(bdb, gid, 3) == 2
        bdb.execute('INITIALIZE 1 MODEL FOR t1_cc')
        bdb.execute('ANALYZE t1_cc FOR 1 ITERATION WAIT')
        bdb.execute('ESTIMATE PROBABILITY OF age = 8 GIVEN (weight = 16)'
                    ' BY t1_cc').next()
        assert engine._last_Y == [(28, 2, 16)]
        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM t1_cc WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]
        bdb.execute('SIMULATE weight FROM t1_cc GIVEN age = 8 LIMIT 1').next()
        assert engine._last_Y == [(28, 1, 8)]
Exemplo n.º 12
0
def test_crosscat_constraints():
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        def predictive_probability_multistate(self, M_c, X_L_list,
                X_D_list, Y, Q):
            self._last_Y = Y
            sup = super(FakeEngine, self)
            return sup.simple_predictive_probability_multistate(M_c=M_c,
                X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)
        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            return super(FakeEngine, self).simple_predictive_sample(seed=seed,
                M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)
        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            return super(FakeEngine, self).impute_and_confidence(seed=seed,
                M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)
    engine = FakeEngine(seed=0)
    mm = CrosscatMetamodel(engine)
    with bayesdb(metamodel=mm) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        bdb.execute('''
            CREATE POPULATION p1 FOR t1 (
                id IGNORE;
                label CATEGORICAL;
                age NUMERICAL;
                weight NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR p1_cc FOR p1 USING crosscat(
                label CATEGORICAL,
                age NUMERICAL,
                weight NUMERICAL
            )
        ''')
        pid = core.bayesdb_get_population(bdb, 'p1')
        assert core.bayesdb_variable_number(bdb, pid, None, 'label') == 1
        assert core.bayesdb_variable_number(bdb, pid, None, 'age') == 2
        assert core.bayesdb_variable_number(bdb, pid, None, 'weight') == 3
        gid = core.bayesdb_get_generator(bdb, pid, 'p1_cc')
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        assert crosscat_cc_colno(bdb, gid, 1) == 0
        assert crosscat_cc_colno(bdb, gid, 2) == 1
        assert crosscat_cc_colno(bdb, gid, 3) == 2
        bdb.execute('INITIALIZE 1 MODEL FOR p1_cc')
        bdb.execute('ANALYZE p1_cc FOR 1 ITERATION WAIT')
        bdb.execute('ESTIMATE PROBABILITY DENSITY OF age = 8'
            ' GIVEN (weight = 16)'
            ' BY p1').next()
        assert engine._last_Y == [(28, 2, 16)]
        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM p1 WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]
        bdb.execute('SIMULATE weight FROM p1 GIVEN age = 8 LIMIT 1').next()
        assert engine._last_Y == [(28, 1, 8)]
        # Simulate with an unknown nominal value should throw an error.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('SIMULATE weight FROM p1 GIVEN label = \'q\' LIMIT 1;')
Exemplo n.º 13
0
def test_insert():
    with test_csv.bayesdb_csv_stream(test_csv.csv_data) as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, "t", f, header=True, create=True)
        guess.bayesdb_guess_generator(bdb, "t_cc", "t", "crosscat")
        bdb.execute("initialize 2 models for t_cc")
        bdb.execute("analyze t_cc for 1 iteration wait")
        generator_id = core.bayesdb_get_generator(bdb, "t_cc")
        row = (41, "F", 96000, 73, "data science", 2)
        bqlfn.bayesdb_insert(bdb, generator_id, row)
Exemplo n.º 14
0
def test_insert():
    with test_csv.bayesdb_csv_stream(test_csv.csv_data) as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True)
        guess.bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
        bdb.execute('initialize 2 models for t_cc')
        bdb.execute('analyze t_cc for 1 iteration wait')
        generator_id = core.bayesdb_get_generator(bdb, 't_cc')
        row = (41, 'F', 96000, 73, 'data science', 2)
        bqlfn.bayesdb_insert(bdb, generator_id, row)
Exemplo n.º 15
0
def test_insert():
    with test_csv.bayesdb_csv_stream(test_csv.csv_data) as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True)
        guess.bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
        bdb.execute('initialize 2 models for t_cc')
        bdb.execute('analyze t_cc for 1 iteration wait')
        generator_id = core.bayesdb_get_generator(bdb, 't_cc')
        row = (41, 'F', 96000, 73, 'data science', 2)
        bqlfn.bayesdb_insert(bdb, generator_id, row)
Exemplo n.º 16
0
def bayesdb_generator(mkbdb, tab, gen, table_schema, data, columns, metamodel_name="crosscat"):
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        qt = bql_quote_name(tab)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(metamodel_name)
        bdb.execute("CREATE GENERATOR %s FOR %s USING %s(%s)" % (qg, qt, qmm, ",".join(columns)))
        yield bdb, core.bayesdb_get_generator(bdb, gen)
Exemplo n.º 17
0
def bayesdb_generator(mkbdb, tab, gen, table_schema, data, columns,
        metamodel_name='crosscat'):
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        qt = bql_quote_name(tab)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(metamodel_name)
        bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' %
            (qg, qt, qmm, ','.join(columns)))
        yield bdb, core.bayesdb_get_generator(bdb, gen)
Exemplo n.º 18
0
def _retest_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, g)
    gid = core.bayesdb_get_generator(bdb, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    bdb.execute("ANALYZE %s FOR 1 ITERATION WAIT" % (qg,))
    bdb.execute("ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT" % (qg,))
    bdb.execute("ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT" % (qg,))
Exemplo n.º 19
0
def bayesdb_population(mkbdb, tab, pop, gen, table_schema, data, columns,
        backend_name='cgpm'):
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        qt = bql_quote_name(tab)
        qp = bql_quote_name(pop)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(backend_name)
        bdb.execute('CREATE POPULATION %s FOR %s(%s)' %
            (qp, qt, ';'.join(columns)))
        bdb.execute('CREATE GENERATOR %s FOR %s USING %s;' %
            (qg, qp, qmm,))
        population_id = core.bayesdb_get_population(bdb, pop)
        generator_id = core.bayesdb_get_generator(bdb, population_id, gen)
        yield bdb, population_id, generator_id
Exemplo n.º 20
0
def _retest_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, g)
    gid = core.bayesdb_get_generator(bdb, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg, ))
Exemplo n.º 21
0
def test_crosscat_constraints():
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        def predictive_probability_multistate(self, M_c, X_L_list, X_D_list, Y, Q):
            self._last_Y = Y
            sup = super(FakeEngine, self)
            return sup.simple_predictive_probability_multistate(M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)

        def simple_predictive_sample(self, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            return super(FakeEngine, self).simple_predictive_sample(M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

        def impute_and_confidence(self, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            return super(FakeEngine, self).impute_and_confidence(M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

    engine = FakeEngine(seed=0)
    mm = CrosscatMetamodel(engine)
    with bayesdb(metamodel=mm) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        bdb.execute(
            """
            CREATE GENERATOR t1_cc FOR t1 USING crosscat(
                label CATEGORICAL,
                age NUMERICAL,
                weight NUMERICAL
            )
        """
        )
        gid = core.bayesdb_get_generator(bdb, "t1_cc")
        assert core.bayesdb_generator_column_number(bdb, gid, "label") == 1
        assert core.bayesdb_generator_column_number(bdb, gid, "age") == 2
        assert core.bayesdb_generator_column_number(bdb, gid, "weight") == 3
        from bayeslite.metamodels.crosscat import crosscat_cc_colno

        assert crosscat_cc_colno(bdb, gid, 1) == 0
        assert crosscat_cc_colno(bdb, gid, 2) == 1
        assert crosscat_cc_colno(bdb, gid, 3) == 2
        bdb.execute("INITIALIZE 1 MODEL FOR t1_cc")
        bdb.execute("ANALYZE t1_cc FOR 1 ITERATION WAIT")
        bdb.execute("ESTIMATE PROBABILITY OF age = 8 GIVEN (weight = 16)" " BY t1_cc").next()
        assert engine._last_Y == [(28, 2, 16)]
        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM t1_cc WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]
        bdb.execute("SIMULATE weight FROM t1_cc GIVEN age = 8 LIMIT 1").next()
        assert engine._last_Y == [(28, 1, 8)]
Exemplo n.º 22
0
def bdb():
    bdb = bayesdb_open(':memory:')

    # Create the population of complements.
    bdb.sql_execute('CREATE TABLE t (a TEXT, b TEXT)')
    for _ in xrange(20):
        bdb.sql_execute('INSERT INTO t (a, b) VALUES (0,1)')
    for _ in xrange(20):
        bdb.sql_execute('INSERT INTO t (a, b) VALUES (1,0)')

    # Create the population and metamodel on the existing rows.
    bdb.execute('CREATE POPULATION p FOR t (MODEL a, b AS NOMINAL)')
    bdb.execute('CREATE METAMODEL m FOR p;')
    bdb.execute('INITIALIZE 1 MODELS FOR m;')
    bdb.execute('ANALYZE m FOR 1000 ITERATION WAIT (OPTIMIZED);')

    # Add new 'hypothetical' rows into the base table to serve as out-of-
    # sample probe points; only zeros, only ones, and nothing.
    for _ in xrange(40, 50):
        bdb.sql_execute('INSERT INTO t (a) VALUES (0)')
    for _ in xrange(50, 60):
        bdb.sql_execute('INSERT INTO t (b) VALUES (1)')
    for _ in xrange(60, 80):
        bdb.sql_execute('INSERT INTO t (a,b) VALUES (NULL, NULL)')

    # Make sure fresh_row_id 80 from the base table, not metamodel.
    population_id = bayesdb_get_population(bdb, 'p')
    assert bayesdb_population_fresh_row_id(bdb, population_id) == 81

    # Make sure the cgpm only has 40 rowids incorporated.
    generator_id = bayesdb_get_generator(bdb, population_id, 'm')
    cursor = bdb.sql_execute(
        '''
        SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
        WHERE generator_id = ?
    ''', (generator_id, ))
    assert cursor_value(cursor) == 40

    # Turn off multiprocessing for sequence of queries.
    bdb.metamodels['cgpm'].set_multiprocess(False)
    return bdb
Exemplo n.º 23
0
def bdb():
    bdb = bayesdb_open(':memory:')

    # Create the population of complements.
    bdb.sql_execute('CREATE TABLE t (a TEXT, b TEXT)')
    for _ in xrange(20):
        bdb.sql_execute('INSERT INTO t (a, b) VALUES (0,1)')
    for _ in xrange(20):
        bdb.sql_execute('INSERT INTO t (a, b) VALUES (1,0)')

    # Create the population and generator on the existing rows.
    bdb.execute('CREATE POPULATION p FOR t (SET STATTYPES OF a, b TO NOMINAL)')
    bdb.execute('CREATE GENERATOR m FOR p;')
    bdb.execute('INITIALIZE 1 MODELS FOR m;')
    bdb.execute('ANALYZE m FOR 1000 ITERATION (OPTIMIZED);')

    # Add new 'hypothetical' rows into the base table to serve as out-of-
    # sample probe points; only zeros, only ones, and nothing.
    for _ in xrange(40, 50):
        bdb.sql_execute('INSERT INTO t (a) VALUES (0)')
    for _ in xrange(50, 60):
        bdb.sql_execute('INSERT INTO t (b) VALUES (1)')
    for _ in xrange(60, 80):
        bdb.sql_execute('INSERT INTO t (a,b) VALUES (NULL, NULL)')

    # Make sure fresh_row_id 80 from the base table, not generator.
    population_id = bayesdb_get_population(bdb, 'p')
    assert bayesdb_population_fresh_row_id(bdb, population_id) == 81

    # Make sure the cgpm only has 40 rowids incorporated.
    generator_id = bayesdb_get_generator(bdb, population_id, 'm')
    cursor = bdb.sql_execute('''
        SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
        WHERE generator_id = ?
    ''', (generator_id,))
    assert cursor_value(cursor) == 40

    # Turn off multiprocessing for sequence of queries.
    bdb.backends['cgpm'].set_multiprocess(False)
    return bdb
Exemplo n.º 24
0
def _retest_example(bdb, exname):
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qg = bql_quote_name(g)

    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)
    p_id = core.bayesdb_get_population(bdb, p)

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)

    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg,))
    try:
        # Test analyzing models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg,))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg,))
    except bayeslite.BQLError, e:
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
Exemplo n.º 25
0
def _retest_example(bdb, exname):
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
     cleanup) = examples[exname]
    qg = bql_quote_name(g)

    metamodel = mm()
    bayeslite.bayesdb_register_backend(bdb, mm())
    p_id = core.bayesdb_get_population(bdb, p)

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)

    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg, ))
    try:
        # Test analyzing models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg, ))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg, ))
    except bayeslite.BQLError, e:
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
Exemplo n.º 26
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name), ))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name), ))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb,
                    'No such generator: %s' % (phrase.simulation.generator, ))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.modelno,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id, name),
                        value)
                    for (name, _expression), value in
                        zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, ','.join(
                     '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                     for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb,
                                              generator_id,
                                              constraints,
                                              colnos,
                                              modelno=modelno,
                                              numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                              core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb,
                            'No such generator: %s' % (repr(cmd.generator), ))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id, ))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb, 'No such metamodel: %s' % (repr(phrase.metamodel), ))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:

                def instantiate(columns):
                    return instantiate_generator(bdb,
                                                 phrase.name,
                                                 phrase.table,
                                                 metamodel,
                                                 columns,
                                                 default=phrase.default)

                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                                           instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                                        model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX
Exemplo n.º 27
0
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    column_stattypes = dict((casefold(column_name),
                             casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                cursor = bdb.sql_execute(bdb, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:      # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' %
                    (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
Exemplo n.º 28
0
def _test_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == n

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert not core.bayesdb_has_generator(bdb, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)

    gid = core.bayesdb_get_generator(bdb, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg, ))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg, ))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    assert gid == core.bayesdb_get_generator(bdb, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        bdb.execute('DROP MODEL 1 FROM %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert not core.bayesdb_generator_has_model(bdb, gid, 1)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)

    # Test analyzing models.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg, ))
Exemplo n.º 29
0
def test_add_drop_models():
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES OF (*);
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR p (SUBSAMPLE 10);')

        # Retrieve id for testing.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def check_modelno_mapping(lookup):
            pairs = bdb.sql_execute('''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id,))
            for pair in pairs:
                assert lookup[pair[0]] == pair[1]
                del lookup[pair[0]]
            assert len(lookup) == 0

        # Initialize some models.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        # Assert identity mapping initially.
        check_modelno_mapping({i:i for i in xrange(16)})

        bdb.execute('ANALYZE m FOR 1 ITERATION (QUIET);')

        # Drop some models.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        # Assert cgpm models are contiguous while bayesdb models are not, with
        # the mapping preserving the strict order.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
        })

        # Run some analysis again.
        bdb.execute('ANALYZE m FOR 1 ITERATION (OPTIMIZED; QUIET);')

        # Initialize 14 models if not existing.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        # Assert cgpm models are 0-14, while bayesdb are 0-15 excluding 14. Note
        # that INITIALIZE 14 MODELS IF NOT EXISTS does not guarantee that 14
        # MODELS in total will exist after the query, rather it will initialize
        # any non-existing modelnos with index 0-13, and any modelnos > 14
        # (modelno 15 in this test case) are untouched.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
            # Recreated models.
            1: 9,
            8: 10,
            9: 11,
            10: 12,
            11: 13,
            12: 14,
        })

        # Drop some more models, add them back with some more, and confirm
        # arithmetic and ordering remains correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
        })
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
            # Re-recreated models.
            0: 13,
            1: 14,
            # New models.
            14: 15,
            16: 16,
            17: 17,
            18: 18,
            19: 19,
        })

        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop all models.
        bdb.execute('DROP MODELS FROM m;')
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # Assert cgpm mapping is cleared.
        cursor = bdb.sql_execute('''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id,))
        assert cursor_value(cursor) == 0
Exemplo n.º 30
0
def _test_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert not core.bayesdb_has_generator(bdb, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)

    gid = core.bayesdb_get_generator(bdb, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    assert gid == core.bayesdb_get_generator(bdb, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert not core.bayesdb_generator_has_model(bdb, gid, 1)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)

    # Test analyzing models.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg,))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg,))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg,))
Exemplo n.º 31
0
    def _cmd_render_crosscat(self, query, sql=None, **kwargs):
        '''Returns a rendering of the specified crosscat state

        Usage: .render_crosscat [options] <generator> <modelno>.

        Options:
            --subsample=<n>
            --width=<w>
            --height=<c>
            --rowlabels=<colname>
            --progress=[True|False]
            --yticklabeslize=<fontsize>
            --xticklabeslize=<fontsize>

        The allowable fontsize strings are:
            xx-small, x-small, # small, medium, large, x-large, xx-large
        '''
        tokens = query.split()
        if len(tokens) != 2:
            self.write_stderr('Usage: .render_crosscat <generator> <modelno>')
            return
        generator = tokens[0]
        modelno = int(tokens[1])
        if not bayesdb_has_generator(self._bdb, None, generator):
            self.write_stderr('No such generator: %s.' % (generator, ))
            return
        generator_id = bayesdb_get_generator(self._bdb, None, generator)
        population_id = bayesdb_generator_population(self._bdb, generator_id)
        backend = bayesdb_generator_backend(self._bdb, generator_id)
        if backend.name() != 'cgpm':
            self.write_stderr('.render_crosscat requires generator from the '
                              'cgpm backend')
            return
        engine = backend._engine(self._bdb, generator_id)
        cursor = self._bdb.sql_execute(
            '''
            SELECT cgpm_modelno FROM bayesdb_cgpm_modelno
            WHERE generator_id = ? AND modelno = ?
        ''', (
                generator_id,
                modelno,
            ))
        cgpm_modelno = cursor_value(cursor, nullok=True)
        if cgpm_modelno is None:
            self.write_stderr('No such model number: %d.' % (modelno, ))
            return
        state = engine.get_state(cgpm_modelno)
        row_names = None
        row_index_column = kwargs.get('rowlabels', None)
        if row_index_column is not None:
            table_name = bayesdb_generator_table(self._bdb, generator_id)
            qt = bql_quote_name(table_name)
            qc = bql_quote_name(row_index_column)
            cursor = self._bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid IN (
                    SELECT table_rowid FROM bayesdb_cgpm_individual
                    WHERE generator_id = ?
                )
            ''' % (qc, qt), (generator_id, ))
            row_names = [c[0] for c in cursor]
        if 'progress' in kwargs:
            sys.stdout.write('Creating figure...\n')
        import cgpm.utils.render
        if 'variable' not in kwargs:
            # Plot the entire state.
            col_names = [
                bayesdb_variable_name(self._bdb, population_id, None, colno)
                for colno in state.outputs
            ]
            fig, _ax = cgpm.utils.render.viz_state(state,
                                                   col_names=col_names,
                                                   row_names=row_names,
                                                   **kwargs)
        else:
            # Plot the view of the requested variable.
            varno = bayesdb_variable_number(self._bdb, population_id,
                                            generator_id, kwargs['variable'])
            view = state.view_for(varno)
            col_names = [
                bayesdb_variable_name(self._bdb, population_id, None, colno)
                for colno in view.outputs[1:]
            ]
            fig, _ax = cgpm.utils.render.viz_view(view,
                                                  col_names=col_names,
                                                  row_names=row_names,
                                                  **kwargs)
        (width, height) = fig.get_size_inches()
        if 'width' in kwargs:
            width = float(kwargs['width'])
            fig.set_size_inches(width, height)
        if 'height' in kwargs:
            height = float(kwargs['height'])
            fig.set_size_inches(width, height)
        if 'progress' in kwargs:
            sys.stdout.write('Rendering figure...\n')
Exemplo n.º 32
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSimModels):
        assert isinstance(phrase.simulation, ast.SimulateModels)
        with bdb.savepoint():
            # Check if table exists.
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                raise BQLError(
                    bdb,
                    'Name already defined as table: %s' % (phrase.name),
                )
            # Set up schema and create the new table.
            qn = sqlite3_quote_name(phrase.name)
            qcns = map(sqlite3_quote_name, [
                simcol.name if simcol.name is not None else str(simcol.col)
                for simcol in phrase.simulation.columns
            ])
            temp = '' if phrase.temp is None else 'TEMP'
            bdb.sql_execute('''
                CREATE %s TABLE %s (%s)
            ''' % (temp, qn, str.join(',', qcns)))
            # Retrieve the rows.
            rows = simulate_models_rows(bdb, phrase.simulation)
            # Insert the rows into the table.
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in rows:
                bdb.sql_execute(insert_sql, row)
            return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        schema = guess.guess_to_schema(guess.bayesdb_guess_stattypes, bdb,
                                       phrase.table)
        # Print schema to console, so user can edit it and/or copy/paste it into
        # the schema definition when creating a population.
        print schema
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            if core.bayesdb_population_generators(bdb, population_id):
                raise BQLError(
                    bdb,
                    'Population still has generators: %r' % (phrase.name, ))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopStatType):
                    # Check the no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb, 'Cannot update statistical types '
                            'for population %s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population'
                            ': %s' % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type'
                            ': %r' % (repr(cmd.stattype), ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX
Exemplo n.º 33
0
def test_subsample():
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb,
                                 'hospitals_full',
                                 'dha',
                                 overrides=[('name', 'key')])
        bayesdb_guess_population(bdb,
                                 'hospitals_sub',
                                 'dha',
                                 overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
            (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full, ))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid, ))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
Exemplo n.º 34
0
    def dot_describe(self, line):
        '''describe BayesDB entities
        [table(s)|generator(s)|columns|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        '''
        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if len(tokens) == 0:
            self.stdout.write('Usage: .describe table(s) [<table>...]\n')
            self.stdout.write('       .describe population(s) [<pop>...]\n')
            self.stdout.write('       .describe variables <pop>\n')
            self.stdout.write('       .describe generator(s) [<gen>...]\n')
            self.stdout.write('       .describe model(s) <gen> [<model>...]\n')
            return
        if casefold(tokens[0]) == 'table' or \
           casefold(tokens[0]) == 'tables':
            params = None
            qualifier = None
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                qualifier = \
                    '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
                ok = True
                for table in params:
                    if not core.bayesdb_has_table(self._bdb, table):
                        self.stdout.write('No such table: %s\n' %
                                          (repr(table), ))
                        ok = False
                if not ok:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = '''
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            ''' % (qualifier, )
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        elif casefold(tokens[0]) in ('population', 'populations'):
            params = None
            qualifier = None
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                names = ','.join('?%d' % (i + 1, )
                                 for i in xrange(len(params)))
                qualifier = '(name IN (%s))' % (names, )
                ok = True
                for population in params:
                    if not core.bayesdb_has_population(self._bdb, population):
                        self.stdout.write('No such population: %s\n' %
                                          (repr(population), ))
                        ok = False
                if not ok:
                    return
            with self._bdb.savepoint():
                cursor = self._bdb.sql_execute(
                    '''
                    SELECT id, name, tabname
                        FROM bayesdb_population
                        WHERE %s
                ''' % (qualifier, ), params)
                pretty.pp_cursor(self.stdout, cursor)
        elif casefold(tokens[0]) == 'generator' or \
                casefold(tokens[0]) == 'generators':
            params = None
            qualifier = None
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                names = ','.join('?%d' % (i + 1, ) for i in range(len(params)))
                qualifier = '''
                    (name IN ({names}))
                '''.format(names=names)
                ok = True
                for generator in params:
                    if not core.bayesdb_has_generator(self._bdb, None,
                                                      generator):
                        self.stdout.write('No such generator: %s\n' %
                                          (repr(generator), ))
                        ok = False
                if not ok:
                    return
            sql = '''
                SELECT id, name, tabname, backend
                    FROM bayesdb_generator
                    WHERE %s
            ''' % (qualifier, )
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout,
                                 self._bdb.sql_execute(sql, params))
        elif casefold(tokens[0]) == 'variables':
            if len(tokens) != 2:
                self.stdout.write('Usage: .describe variables <population>\n')
                return
            population = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_population(self._bdb, population):
                    self.stdout.write('No such population: %r\n' %
                                      (population, ))
                    return
                population_id = core.bayesdb_get_population(
                    self._bdb, population)
                sql = '''
                    SELECT c.colno AS colno, c.name AS name,
                            v.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_population AS p,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_variable AS v
                                USING (colno))
                        WHERE p.id = ? AND p.id = v.population_id
                            AND p.tabname = c.tabname
                        ORDER BY colno ASC;
                '''
                cursor = self._bdb.sql_execute(sql, (population_id, ))
                pretty.pp_cursor(self.stdout, cursor)
        elif casefold(tokens[0]) == 'model' or \
                casefold(tokens[0]) == 'models':
            if len(tokens) < 2:
                self.stdout.write('Describe models of what generator?\n')
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator(self._bdb, None, generator):
                    self.stdout.write('No such generator: %s\n' %
                                      (repr(generator), ))
                    return
                generator_id = core.bayesdb_get_generator(
                    self._bdb, None, generator)
                qualifier = None
                if len(tokens) == 2:
                    qualifier = '1'
                else:
                    modelnos = []
                    for token in tokens[2:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            self.stdout.write('Invalid model number: %s\n' %
                                              (repr(token), ))
                            return
                        else:
                            if not core.bayesdb_generator_has_model(
                                    self._bdb, generator_id, modelno):
                                self.stdout.write('No such model: %d\n' %
                                                  (modelno, ))
                                return
                            modelnos.append(modelno)
                    qualifier = 'modelno IN (%s)' % \
                        (','.join(map(str, modelnos),))
                sql = '''
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                ''' % (qualifier, )
                cursor = self._bdb.sql_execute(sql, (generator_id, ))
                pretty.pp_cursor(self.stdout, cursor)
        else:
            self.stdout.write('Usage: .describe table(s) [<table>...]\n')
            self.stdout.write('       .describe generator(s) [<gen>...]\n')
            self.stdout.write('       .describe variables <pop>\n')
            self.stdout.write('       .describe model(s) <gen> [<model>...]\n')
Exemplo n.º 35
0
 def create_generator(self, bdb, table, schema, instantiate):
     # Parse the schema.
     (columns, lcols, _fcols, fcol_to_pcols, fcol_to_fpred,
         dependencies) = self.parse(schema)
     # Instantiate **this** generator.
     genid, bdbcolumns = instantiate(columns.items())
     # Create internal crosscat generator. The name will be the same as
     # this generator name, with a _cc suffix.
     SUFFIX = '_cc'
     cc_name = bayeslite.core.bayesdb_generator_name(bdb, genid) + SUFFIX
     # Create strings for crosscat schema.
     cc_cols = ','.join('{} {}'.format(quote(c), quote(columns[c]))
                        for c in lcols)
     cc_dep = []
     for dep, colnames in dependencies:
         qcns = ','.join(map(quote, colnames))
         if dep:
             cc_dep.append('DEPENDENT({})'.format(qcns))
         else:
             cc_dep.append('INDEPENDENT({})'.format(qcns))
     bql = """
         CREATE GENERATOR {} FOR {} USING crosscat(
             {}, {}
         );
     """.format(quote(cc_name), quote(table), cc_cols, ','.join(cc_dep))
     bdb.execute(bql)
     # Convert strings to column numbers.
     fcolno_to_pcolnos = {}
     for f in fcol_to_pcols:
         fcolno = core.bayesdb_generator_column_number(bdb, genid, f)
         fcolno_to_pcolnos[fcolno] = [core.bayesdb_generator_column_number(
             bdb, genid, col) for col in fcol_to_pcols[f]]
     with bdb.savepoint():
         # Save internal cc generator id.
         bdb.sql_execute('''
             INSERT INTO bayesdb_composer_cc_id
                 (generator_id, crosscat_generator_id) VALUES (?,?)
         ''', (genid, core.bayesdb_get_generator(bdb, cc_name),))
         # Save lcols/fcolnos.
         for colno, _, _ in bdbcolumns:
             local = colno not in fcolno_to_pcolnos
             bdb.sql_execute('''
                 INSERT INTO bayesdb_composer_column_owner
                     (generator_id, colno, local) VALUES (?,?,?)
             ''', (genid, colno, int(local),))
         # Save parents of foreign columns.
         for fcolno in fcolno_to_pcolnos:
             for pcolno in fcolno_to_pcolnos[fcolno]:
                 bdb.sql_execute('''
                     INSERT INTO bayesdb_composer_column_parents
                         (generator_id, fcolno, pcolno) VALUES (?,?,?)
                 ''', (genid, fcolno, pcolno,))
         # Save topological order.
         topo = self.topological_sort(fcolno_to_pcolnos)
         for position, (colno, _) in enumerate(topo):
             bdb.sql_execute('''
                 INSERT INTO bayesdb_composer_column_toposort
                     (generator_id, colno, position) VALUES (?,?,?)
                 ''', (genid, colno, position,))
         # Save predictor names of foreign columns.
         for fcolno in fcolno_to_pcolnos:
             fp_name = fcol_to_fpred[casefold(
                 core.bayesdb_generator_column_name(bdb,genid, fcolno))]
             bdb.sql_execute('''
                 INSERT INTO bayesdb_composer_column_foreign_predictor
                     (generator_id, colno, predictor_name) VALUES (?,?,?)
             ''', (genid, fcolno, casefold(fp_name)))
Exemplo n.º 36
0
def test_cgpm_extravaganza__ci_slow():
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(0, len(countries))]
                    name = 'sat-%s-%d' % (
                        country, bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
Exemplo n.º 37
0
def test_cgpm_extravaganza__ci_slow():
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                                          bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
Exemplo n.º 38
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            if not core.bayesdb_has_population(bdb,
                                               phrase.simulation.population):
                raise BQLError(
                    bdb, 'No such population: %s' %
                    (phrase.simulation.population, ))
            population_id = core.bayesdb_get_population(
                bdb, phrase.simulation.population)
            generator_id = None
            if phrase.simulation.generator is not None:
                if not core.bayesdb_has_generator(bdb, population_id,
                                                  phrase.simulation.generator):
                    raise BQLError(
                        bdb, 'No such generator: %r' %
                        (phrase.simulation.generator, ))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.simulation.generator)
            table = core.bayesdb_population_table(bdb, population_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such variable'
                        ' in population %r: %s' %
                        (phrase.simulation.population, column_name))
            for column_name, _expression in phrase.simulation.constraints:
                cn = casefold(column_name)
                if (cn not in column_sqltypes
                        and cn not in core.bayesdb_rowid_tokens(bdb)):
                    raise BQLError(
                        bdb, 'No such variable in population %s: %s' %
                        (phrase.simulation.population, column_name))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)

            def map_var(var):
                if casefold(var) not in core.bayesdb_rowid_tokens(bdb):
                    return core.bayesdb_variable_number(
                        bdb, population_id, generator_id, var)
                else:
                    return casefold(var)

            def map_constraint(((var, _expression), value)):
                return (map_var(var), value)

            constraints = map(
                map_constraint,
                zip(phrase.simulation.constraints, cursor[0][1:]))
            colnos = map(map_var, column_names)
            schema = ','.join('%s %s' %
                              (qcn, column_sqltypes[casefold(column_name)])
                              for qcn, column_name in zip(qcns, column_names))
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, schema))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(
                    bdb,
                    population_id,
                    constraints,
                    colnos,
                    generator_id=generator_id,
                    numpredictions=nsamples,
                    accuracy=phrase.simulation.accuracy):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)
Exemplo n.º 39
0
def test_bayesdb_implicit_population_generator():
    with bayesdb() as bdb:
        # Create table t.
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        # Create an implicit population.
        bdb.execute('create population for t (guess (*));')
        population_id = core.bayesdb_get_population(bdb, 't')
        # Confirm table t has an implicit population.
        assert core.bayesdb_table_has_implicit_population(bdb, 't')
        # Confirm population t is implicit.
        assert core.bayesdb_population_is_implicit(bdb, population_id)
        # Fail to create a new implicit population.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('create population for t (guess (*))')
        # Fail to create a new named population.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create population p for t (guess (*))')
        # Check ifnotexists phrase.
        bdb.execute('create population if not exists for t (guess (*))')
        # Create an implicit generator.
        bdb.execute('create generator for t')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 't')
        # Confirm population t has an implicit generator.
        assert core.bayesdb_population_has_implicit_generator(
            bdb, population_id)
        # Confirm generator t is implicit.
        assert core.bayesdb_generator_is_implicit(bdb, generator_id)
        # Fail to create a new implicit generator.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('create generator for t')
        # Fail to create a new named generator.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create generator p for t')
        # Check ifnotexists phrase.
        bdb.execute('create generator if not exists for t')
        # Drop the implicit generator.
        bdb.execute('drop generator t')
        assert not \
            core.bayesdb_population_has_implicit_generator(bdb, population_id)
        # Create named generator.
        bdb.execute('create generator g0 for t')
        generator_id0 = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert not core.bayesdb_population_has_implicit_generator(
            bdb, population_id)
        assert not core.bayesdb_generator_is_implicit(bdb, generator_id0)
        # Fail to create implicit generator.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create generator for t')
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create generator if not exists for t')
        # Succeed in creating named generator.
        bdb.execute('create generator g1 for t')
        generator_id1 = core.bayesdb_get_generator(bdb, population_id, 'g1')
        assert not core.bayesdb_generator_is_implicit(bdb, generator_id1)
        # Drop the named generators.
        bdb.execute('drop generator g0')
        bdb.execute('drop generator g1')
        # Drop the population.
        bdb.execute('drop population t')
        # Create named population.
        bdb.execute('create population p0 for t (guess (*))')
        population_id0 = core.bayesdb_get_population(bdb, 'p0')
        assert not core.bayesdb_table_has_implicit_population(bdb, 't')
        assert not core.bayesdb_population_is_implicit(bdb, population_id0)
        # Fail to create implicit population.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create population for t (guess (*))')
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create population if not exists for t (guess (*))')
        # Succeed in creating named population.
        bdb.execute('create population p1 for t (guess (*))')
        population_id1 = core.bayesdb_get_population(bdb, 'p1')
        assert not core.bayesdb_table_has_implicit_population(bdb, 't')
        assert not core.bayesdb_population_is_implicit(bdb, population_id1)
        # Drop the named populations.
        bdb.execute('drop population p0')
        bdb.execute('drop population p1')
Exemplo n.º 40
0
def instantiate_generator(bdb,
                          gen_name,
                          table,
                          metamodel,
                          columns,
                          default=None):
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(
            bdb, 'Name already defined as table: %s' % (repr(gen_name), ))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = False
    if core.bayesdb_has_generator(bdb, gen_name):
        generator_already_existed = True
    else:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)'''
        cursor = bdb.sql_execute(
            generator_sql, {
                'name': gen_name,
                'table': table,
                'metamodel': metamodel.name(),
                'defaultp': default,
            })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            cursor = bdb.sql_execute(stattype_sql, {
                'stattype': stattype,
            })
            if cursor_value(cursor) == 0:
                invalid.add(stattype)
                continue
            column_map[casefold(name)] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(
            bdb, 'Duplicate column names: %s' % (repr(list(duplicates)), ))
    if invalid:
        raise BQLError(
            bdb, 'Invalid statistical types: %s' % (repr(list(invalid)), ))

    if not generator_already_existed:
        # Insert column records.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        '''
        for name, stattype in columns:
            colno = column_map[casefold(name)]
            stattype = casefold(stattype)
            bdb.sql_execute(
                column_sql, {
                    'generator_id': generator_id,
                    'colno': colno,
                    'stattype': stattype,
                })

    column_list = sorted((column_map[casefold(name)], name, stattype)
                         for name, stattype in columns)
    return generator_id, column_list
Exemplo n.º 41
0
def simulate_models_rows(bdb, simulation):
    assert all(isinstance(c, ast.SelColExp) for c in simulation.columns)
    population_id = core.bayesdb_get_population(
        bdb, simulation.population)
    generator_id = None
    if simulation.generator is not None:
        if not core.bayesdb_has_generator(
                bdb, population_id, simulation.generator):
            raise BQLError(bdb, 'No such generator: %r' %
                (simulation.generator,))
        generator_id = core.bayesdb_get_generator(
            bdb, population_id, simulation.generator)
    def retrieve_literal(expression):
        assert isinstance(expression, ast.ExpLit)
        lit = expression.value
        if isinstance(lit, ast.LitNull):
            return None
        elif isinstance(lit, ast.LitInt):
            return lit.value
        elif isinstance(lit, ast.LitFloat):
            return lit.value
        elif isinstance(lit, ast.LitString):
            return lit.value
        else:
            assert False
    def retrieve_variable(var):
        if not core.bayesdb_has_variable(
                bdb, population_id, generator_id, var):
            raise BQLError(bdb, 'No such population variable: %s' % (var,))
        return core.bayesdb_variable_number(
            bdb, population_id, generator_id, var)
    def simulate_column(exp):
        if isinstance(exp, ast.ExpCol):
            # XXX This is wrong -- it returns independent samples from
            # the marginals of each variable, not one sample from the
            # joint on all variables.
            if False:
                raise BQLError(bdb,
                    'SIMULATE FROM MODELS OF can\'t sample conditional')
                # XXX Gotta weight each model by probability of
                # constraints.
                constraints = [
                    (retrieve_variable(v), retrieve_literal(e))
                    for v, e in simulation.constraints
                ]
            else:
                constraints = []
            colnos = [retrieve_variable(exp.column)]
            accuracy = 1        # XXX Allow nontrivial accuracy?
            samples = bqlfn.bayesdb_simulate(
                bdb, population_id, constraints, colnos,
                generator_id=generator_id, numpredictions=1,
                accuracy=accuracy)
            return [sample[0] for sample in samples]
        elif isinstance(exp, ast.ExpBQLDepProb):
            raise BQLError(bdb,
                'DEPENDENCE PROBABILITY simulation still unsupported.')
        elif isinstance(exp, ast.ExpBQLProbDensity):
            raise BQLError(bdb,
                'PROBABILITY DENSITY OF simulation still unsupported.')
        elif isinstance(exp, ast.ExpBQLMutInf):
            colnos0 = [retrieve_variable(c) for c in exp.columns0]
            colnos1 = [retrieve_variable(c) for c in exp.columns1]
            constraint_args = ()
            if exp.constraints is not None:
                constraint_args = tuple(itertools.chain.from_iterable([
                    [retrieve_variable(colname), retrieve_literal(expr)]
                    for colname, expr in exp.constraints
                ]))
            nsamples = exp.nsamples and retrieve_literal(exp.nsamples)
            # One mi_list per generator of the population.
            #
            # XXX fsaad@20170625: Setting modelnos = None arbitrarily, figure
            # out how to set the modelnos argument.
            mi_lists = bqlfn._bql_column_mutual_information(
                bdb, population_id, generator_id, None, colnos0, colnos1,
                nsamples, *constraint_args)
            return list(itertools.chain.from_iterable(mi_lists))
        else:
            raise BQLError(bdb,
                'Only constants can be simulated: %s.' % (simulation,))
    columns = [simulate_column(c.expression) for c in simulation.columns]
    # All queries must return the same number of rows, equal to the number of
    # models of all generators implied by the query.
    assert all(len(column) == len(columns[0]) for column in columns)
    # Convert the columns into rows.
    return zip(*columns)
Exemplo n.º 42
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" % (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" % (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb, phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" % (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id, name), value)
                for (name, _expression), value in zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [core.bayesdb_generator_column_number(bdb, generator_id, name) for name in column_names]
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)"
                % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """
                INSERT INTO %s (%s) VALUES (%s)
            """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos, modelno=modelno, numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" % (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?", (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError("Renaming columns" " not yet implemented.")
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table, cmd.old):
                            raise BQLError(bdb, "No such column in table %s" ": %s" % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, "Column already exists" " in table %s: %s" % (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql, {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id, old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" % (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb, cmd.generator)
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" % (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():

            def instantiate(columns):
                return instantiate_generator(
                    bdb,
                    phrase.name,
                    phrase.table,
                    metamodel,
                    columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )

            metamodel.create_generator(bdb, phrase.table, phrase.schema, instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" % (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """
                DELETE FROM bayesdb_generator WHERE id = ?
            """
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" % (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = """
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql, (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
            else:
                existing = set(
                    modelno for modelno in modelnos if core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb, "Generator %s already has models: %s" % (repr(phrase.generator), sorted(existing))
                    )

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {"generator_id": generator_id, "modelno": modelno, "iterations": 0})

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos, model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb,
            generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = """
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {"generator_id": generator_id, "modelno": modelno})
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, "No such model" " in generator %s: %s" % (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = """
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {"generator_id": generator_id, "modelno": modelno})
        return empty_cursor(bdb)

    assert False  # XXX
Exemplo n.º 43
0
def simulate_models_rows(bdb, simulation):
    assert all(isinstance(c, ast.SimCol) for c in simulation.columns)
    population_id = core.bayesdb_get_population(bdb, simulation.population)
    generator_id = None
    if simulation.generator is not None:
        if not core.bayesdb_has_generator(bdb, population_id,
                                          simulation.generator):
            raise BQLError(bdb,
                           'No such generator: %r' % (simulation.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                  simulation.generator)

    def retrieve_literal(expression):
        assert isinstance(expression, ast.ExpLit)
        lit = expression.value
        if isinstance(lit, ast.LitNull):
            return None
        elif isinstance(lit, ast.LitInt):
            return lit.value
        elif isinstance(lit, ast.LitFloat):
            return lit.value
        elif isinstance(lit, ast.LitString):
            return lit.value
        else:
            assert False

    def retrieve_variable(var):
        if not core.bayesdb_has_variable(bdb, population_id, generator_id,
                                         var):
            raise BQLError(bdb, 'No such population variable: %s' % (var, ))
        return core.bayesdb_variable_number(bdb, population_id, generator_id,
                                            var)

    def simulate_column(phrase):
        if isinstance(phrase, ast.ExpBQLDepProb):
            raise BQLError(
                bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
        elif isinstance(phrase, ast.ExpBQLProb):
            raise BQLError(bdb, 'PROBABILITY OF simulation still unsupported.')
        elif isinstance(phrase, ast.ExpBQLMutInf):
            colno0 = retrieve_variable(phrase.column0)
            colno1 = retrieve_variable(phrase.column1)
            constraint_args = ()
            if phrase.constraints is not None:
                constraint_args = tuple(
                    itertools.chain.from_iterable(
                        [[retrieve_variable(colname),
                          retrieve_literal(expr)]
                         for colname, expr in phrase.constraints]))
            nsamples = phrase.nsamples and retrieve_literal(phrase.nsamples)
            # One mi_list per generator of the population.
            mi_lists = bqlfn._bql_column_mutual_information(
                bdb, population_id, generator_id, colno0, colno1, nsamples,
                *constraint_args)
            return list(itertools.chain.from_iterable(mi_lists))
        else:
            raise BQLError(
                bdb, 'Only constants can be simulated: %s.' % (simulation, ))

    columns = [simulate_column(c.col) for c in simulation.columns]
    # All queries must return the same number of rows, equal to the number of
    # models of all generators implied by the query.
    assert all(len(column) == len(columns[0]) for column in columns)
    # Convert the columns into rows.
    return zip(*columns)
Exemplo n.º 44
0
def simulate_models_rows(bdb, simulation):
    assert all(isinstance(c, ast.SelColExp) for c in simulation.columns)
    population_id = core.bayesdb_get_population(bdb, simulation.population)
    generator_id = None
    if simulation.generator is not None:
        if not core.bayesdb_has_generator(bdb, population_id,
                                          simulation.generator):
            raise BQLError(bdb,
                           'No such generator: %r' % (simulation.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                  simulation.generator)

    def retrieve_literal(expression):
        assert isinstance(expression, ast.ExpLit)
        lit = expression.value
        if isinstance(lit, ast.LitNull):
            return None
        elif isinstance(lit, ast.LitInt):
            return lit.value
        elif isinstance(lit, ast.LitFloat):
            return lit.value
        elif isinstance(lit, ast.LitString):
            return lit.value
        else:
            assert False

    def retrieve_variable(var):
        if not core.bayesdb_has_variable(bdb, population_id, generator_id,
                                         var):
            raise BQLError(bdb, 'No such population variable: %s' % (var, ))
        return core.bayesdb_variable_number(bdb, population_id, generator_id,
                                            var)

    def simulate_column(exp):
        if isinstance(exp, ast.ExpCol):
            # XXX This is wrong -- it returns independent samples from
            # the marginals of each variable, not one sample from the
            # joint on all variables.
            if False:
                raise BQLError(
                    bdb, 'SIMULATE FROM MODELS OF can\'t sample conditional')
                # XXX Gotta weight each model by probability of
                # constraints.
                constraints = [(retrieve_variable(v), retrieve_literal(e))
                               for v, e in simulation.constraints]
            else:
                constraints = []
            colnos = [retrieve_variable(exp.column)]
            accuracy = 1  # XXX Allow nontrivial accuracy?
            samples = bqlfn.bayesdb_simulate(bdb,
                                             population_id,
                                             constraints,
                                             colnos,
                                             generator_id=generator_id,
                                             numpredictions=1,
                                             accuracy=accuracy)
            return [sample[0] for sample in samples]
        elif isinstance(exp, ast.ExpBQLDepProb):
            raise BQLError(
                bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
        elif isinstance(exp, ast.ExpBQLProbDensity):
            raise BQLError(
                bdb, 'PROBABILITY DENSITY OF simulation still unsupported.')
        elif isinstance(exp, ast.ExpBQLMutInf):
            colnos0 = [retrieve_variable(c) for c in exp.columns0]
            colnos1 = [retrieve_variable(c) for c in exp.columns1]
            constraint_args = ()
            if exp.constraints is not None:
                constraint_args = tuple(
                    itertools.chain.from_iterable(
                        [[retrieve_variable(colname),
                          retrieve_literal(expr)]
                         for colname, expr in exp.constraints]))
            nsamples = exp.nsamples and retrieve_literal(exp.nsamples)
            # One mi_list per generator of the population.
            #
            # XXX fsaad@20170625: Setting modelnos = None arbitrarily, figure
            # out how to set the modelnos argument.
            mi_lists = bqlfn._bql_column_mutual_information(
                bdb, population_id, generator_id, None, colnos0, colnos1,
                nsamples, *constraint_args)
            return list(itertools.chain.from_iterable(mi_lists))
        else:
            raise BQLError(
                bdb, 'Only constants can be simulated: %s.' % (simulation, ))

    columns = [simulate_column(c.expression) for c in simulation.columns]
    # All queries must return the same number of rows, equal to the number of
    # models of all generators implied by the query.
    assert all(len(column) == len(columns[0]) for column in columns)
    # Convert the columns into rows.
    return zip(*columns)
Exemplo n.º 45
0
def instantiate_generator(bdb, gen_name, table, metamodel, columns, default=None):
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, "Name already defined as table: %s" % (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = False
    if core.bayesdb_has_generator(bdb, gen_name):
        generator_already_existed = True
    else:
        # Create the generator record.
        generator_sql = """INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)"""
        cursor = bdb.sql_execute(
            generator_sql, {"name": gen_name, "table": table, "metamodel": metamodel.name(), "defaultp": default}
        )
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = """
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    """
    stattype_sql = """
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    """
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {"table": table, "column_name": name})
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            cursor = bdb.sql_execute(stattype_sql, {"stattype": stattype})
            if cursor_value(cursor) == 0:
                invalid.add(stattype)
                continue
            column_map[casefold(name)] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, "No such columns in table %s: %s" % (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, "Duplicate column names: %s" % (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, "Invalid statistical types: %s" % (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records.
        column_sql = """
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        """
        for name, stattype in columns:
            colno = column_map[casefold(name)]
            stattype = casefold(stattype)
            bdb.sql_execute(column_sql, {"generator_id": generator_id, "colno": colno, "stattype": stattype})

    column_list = sorted((column_map[casefold(name)], name, stattype) for name, stattype in columns)
    return generator_id, column_list
Exemplo n.º 46
0
def _test_example(bdb, exname):
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id+1, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError, e:
           # loom does not allow model numbers to be specified in drop models
           assert exname == 'loom'
Exemplo n.º 47
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibility implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX
Exemplo n.º 48
0
def _test_example(bdb, exname):
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
     cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    metamodel = mm()
    bayeslite.bayesdb_register_backend(bdb, metamodel)

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id + 1, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg, ))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg, ))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg, ))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError, e:
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
Exemplo n.º 49
0
def test_loom_complex_add_analyze_drop_sequence():
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (x)')
            for x in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (x, ))
            bdb.execute('create population p for t (x numerical)')
            bdb.execute('create generator g for p using loom')

            bdb.execute('initialize 2 models for g')

            bdb.execute('initialize 3 models if not exists for g')
            population_id = bayesdb_get_population(bdb, 'p')
            generator_id = bayesdb_get_generator(bdb, population_id, 'g')
            cursor = bdb.sql_execute(
                '''
                SELECT num_models FROM bayesdb_loom_generator_model_info
                    WHERE generator_id=?;
            ''', (generator_id, ))
            num_models = cursor.fetchall()[0][0]
            # Make sure that the total number of models is
            # 3 and not 2 + 3 = 5.
            assert num_models == 3

            bdb.execute('analyze g for 10 iterations')
            bdb.execute('estimate probability density of x = 50 from p')

            with pytest.raises(BQLError):
                bdb.execute('drop model 1 from g')
            bdb.execute('drop models from g')

            bdb.execute('initialize 1 models for g')
            population_id = bayesdb_get_population(bdb, 'p')
            generator_id = bayesdb_get_generator(bdb, population_id, 'g')
            cursor = bdb.sql_execute(
                '''
                SELECT num_models FROM bayesdb_loom_generator_model_info
                    WHERE generator_id=?;
            ''', (generator_id, ))
            num_models = cursor.fetchall()[0][0]
            # Make sure that the number of models was reset after dropping.
            assert num_models == 1
            bdb.execute('analyze g for 50 iterations')

            cursor = bdb.execute('''
                estimate probability density of x = 50 from p''')
            probDensityX1 = cursor.fetchall()
            probDensityX1 = [x[0] for x in probDensityX1]
            bdb.execute('simulate x from p limit 1').fetchall()
            bdb.execute('drop models from g')

            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            cursor = bdb.execute('''
                estimate probability density of x = 50 from p''')
            probDensityX2 = cursor.fetchall()
            probDensityX2 = [x[0] for x in probDensityX2]
            # Check that the analysis started fresh after dropping models
            # and that it produces similar results the second time.
            for px1, px2 in zip(probDensityX1, probDensityX2):
                assert abs(px1 - px2) < .01
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
Exemplo n.º 50
0
def test_add_drop_models():
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES FOR (*);
            )
        ''')
        bdb.execute('CREATE METAMODEL m FOR p (SUBSAMPLE 10);')

        # Retrieve id for testing.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def check_modelno_mapping(lookup):
            pairs = bdb.sql_execute(
                '''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id, ))
            for pair in pairs:
                assert lookup[pair[0]] == pair[1]
                del lookup[pair[0]]
            assert len(lookup) == 0

        # Initialize some models.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        # Assert identity mapping initially.
        check_modelno_mapping({i: i for i in xrange(16)})

        bdb.execute('ANALYZE m FOR 1 ITERATION WAIT (QUIET);')

        # Drop some models.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        # Assert cgpm models are contiguous while bayesdb models are not, with
        # the mapping preserving the strict order.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
        })

        # Run some analysis again.
        bdb.execute('ANALYZE m FOR 1 ITERATION WAIT (OPTIMIZED; QUIET);')

        # Initialize 14 models if not existing.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        # Assert cgpm models are 0-14, while bayesdb are 0-15 excluding 14. Note
        # that INITIALIZE 14 MODELS IF NOT EXISTS does not guarantee that 14
        # MODELS in total will exist after the query, rather it will initialize
        # any non-existing modelnos with index 0-13, and any modelnos > 14
        # (modelno 15 in this test case) are untouched.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
            # Recreated models.
            1: 9,
            8: 10,
            9: 11,
            10: 12,
            11: 13,
            12: 14,
        })

        # Drop some more models, add them back with some more, and confirm
        # arithmetic and ordering remains correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
        })
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
            # Re-recreated models.
            0: 13,
            1: 14,
            # New models.
            14: 15,
            16: 16,
            17: 17,
            18: 18,
            19: 19,
        })

        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop all models.
        bdb.execute('DROP MODELS FROM m;')
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # Assert cgpm mapping is cleared.
        cursor = bdb.sql_execute(
            '''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id, ))
        assert cursor_value(cursor) == 0
Exemplo n.º 51
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, ))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder(
                '''
                CREATE TEMP TABLE %s (column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT)
            ''' % (qtt), ())
            for cn, st, ct in zip(column_names, stattypes,
                                  distinct_value_counts):
                out.winder(
                    '''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt, ))
            out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [
                    core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids
                ]
                raise BQLError(
                    bdb, 'Population %r still has metamodels: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'No such variable in base table: %s' % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(bdb, population_id, None,
                                                 cmd.name):
                        raise BQLError(
                            bdb,
                            'Variable already in population: %s' % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'Cannot add variable without any values: %s' %
                            (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute('SELECT %s FROM %s' %
                                                 (qc, qt))
                        rows = cursor.fetchall()
                        [stattype,
                         reason] = bayesdb_guess_stattypes([cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(
                                bdb, 'Values in column %s appear to be keys.' %
                                (cmd.name, ))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(
                                bdb, 'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name, ))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                                       'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'Numerical column contains string values: %r ' %
                            (qc, ))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(bdb, population_id, cmd.name,
                                                  stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) metamodel in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            # XXX Omit needless bayesdb_generator_column table
                            # Github issue #441.
                            bdb.sql_execute(
                                '''
                                INSERT INTO bayesdb_generator_column
                                    VALUES (:generator_id, :colno, :stattype)
                            ''', {
                                    'generator_id': generator_id,
                                    'colno': colno,
                                    'stattype': stattype,
                                })
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb,
                            'Cannot update statistical types for population '
                            '%s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population: %s' %
                            (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type: %r' %
                            (repr(cmd.stattype), ))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(
                                bdb, 'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars, ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(
                        bdb, 'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the metamodel.
                metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                metamodel.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the metamodel.
        generator_id = None
        if phrase.metamodel:
            if not core.bayesdb_has_generator(bdb, population_id,
                                              phrase.metamodel):
                raise BQLError(bdb,
                               'No such metamodel: %r' % (phrase.population, ))
            generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                      phrase.metamodel)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(bdb, population_id, None,
                                         phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target, ))
        colno_target = core.bayesdb_variable_number(bdb, population_id, None,
                                                    phrase.target)
        if core.bayesdb_variable_stattype(bdb, population_id, colno_target) != \
                'numerical':
            raise BQLError(
                bdb,
                'Target variable is not numerical: %r' % (phrase.target, ))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(bdb, population_id, None,
                                             given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(colno for colno in colno_givens
                                  if colno != colno_target)
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(bdb,
                                      population_id,
                                      generator_id,
                                      modelnos,
                                      constraints,
                                      colnos,
                                      numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(bdb, population_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(target_values, given_values, given_names,
                                   stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder(
            '''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt, ), ())
        for variable, coef in coefficients:
            out.winder(
                '''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (
                    variable,
                    coef,
                ))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt, ))
        out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    assert False  # XXX
Exemplo n.º 52
0
def test_bayesdb_implicit_population_generator():
    with bayesdb() as bdb:
        # Create table t.
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        # Create an implicit population.
        bdb.execute('create population for t (guess (*));')
        population_id = core.bayesdb_get_population(bdb, 't')
        # Confirm table t has an implicit population.
        assert core.bayesdb_table_has_implicit_population(bdb, 't')
        # Confirm population t is implicit.
        assert core.bayesdb_population_is_implicit(bdb, population_id)
        # Fail to create a new implicit population.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('create population for t (guess (*))')
        # Fail to create a new named population.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create population p for t (guess (*))')
        # Check ifnotexists phrase.
        bdb.execute('create population if not exists for t (guess (*))')
        # Create an implicit generator.
        bdb.execute('create generator for t')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 't')
        # Confirm population t has an implicit generator.
        assert core.bayesdb_population_has_implicit_generator(
            bdb, population_id)
        # Confirm generator t is implicit.
        assert core.bayesdb_generator_is_implicit(bdb, generator_id)
        # Fail to create a new implicit generator.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('create generator for t')
        # Fail to create a new named generator.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create generator p for t')
        # Check ifnotexists phrase.
        bdb.execute('create generator if not exists for t')
        # Drop the implicit generator.
        bdb.execute('drop generator t')
        assert not \
            core.bayesdb_population_has_implicit_generator(bdb, population_id)
        # Create named generator.
        bdb.execute('create generator g0 for t')
        generator_id0 = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert not core.bayesdb_population_has_implicit_generator(
            bdb, population_id)
        assert not core.bayesdb_generator_is_implicit(bdb, generator_id0)
        # Fail to create implicit generator.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create generator for t')
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create generator if not exists for t')
        # Succeed in creating named generator.
        bdb.execute('create generator g1 for t')
        generator_id1 = core.bayesdb_get_generator(bdb, population_id, 'g1')
        assert not core.bayesdb_generator_is_implicit(bdb, generator_id1)
        # Drop the named generators.
        bdb.execute('drop generator g0')
        bdb.execute('drop generator g1')
        # Drop the population.
        bdb.execute('drop population t')
        # Create named population.
        bdb.execute('create population p0 for t (guess (*))')
        population_id0 = core.bayesdb_get_population(bdb, 'p0')
        assert not core.bayesdb_table_has_implicit_population(bdb, 't')
        assert not core.bayesdb_population_is_implicit(bdb, population_id0)
        # Fail to create implicit population.
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create population for t (guess (*))')
        with pytest.raises(apsw.ConstraintError):
            bdb.execute('create population if not exists for t (guess (*))')
        # Succeed in creating named population.
        bdb.execute('create population p1 for t (guess (*))')
        population_id1 = core.bayesdb_get_population(bdb, 'p1')
        assert not core.bayesdb_table_has_implicit_population(bdb, 't')
        assert not core.bayesdb_population_is_implicit(bdb, population_id1)
        # Drop the named populations.
        bdb.execute('drop population p0')
        bdb.execute('drop population p1')
Exemplo n.º 53
0
    def dot_describe(self, line):
        '''describe BayesDB entities
        [table(s)|generator(s)|columns|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        '''
        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if len(tokens) == 0:
            self.stdout.write('Usage: .describe table(s) [<table>...]\n')
            self.stdout.write('       .describe population(s) [<pop>...]\n')
            self.stdout.write('       .describe variables <pop>\n')
            self.stdout.write('       .describe generator(s) [<gen>...]\n')
            self.stdout.write('       .describe model(s) <gen> [<model>...]\n')
            return
        if casefold(tokens[0]) == 'table' or \
           casefold(tokens[0]) == 'tables':
            params = None
            qualifier = None
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                qualifier = \
                    '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
                ok = True
                for table in params:
                    if not core.bayesdb_has_table(self._bdb, table):
                        self.stdout.write('No such table: %s\n' %
                                          (repr(table),))
                        ok = False
                if not ok:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = '''
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            ''' % (qualifier,)
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        elif casefold(tokens[0]) in ('population', 'populations'):
            params = None
            qualifier = None
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                names = ','.join('?%d' % (i + 1,) for i in xrange(len(params)))
                qualifier = '(name IN (%s))' % (names,)
                ok = True
                for population in params:
                    if not core.bayesdb_has_population(self._bdb, population):
                        self.stdout.write('No such population: %s\n' %
                            (repr(population),))
                        ok = False
                if not ok:
                    return
            with self._bdb.savepoint():
                cursor = self._bdb.sql_execute('''
                    SELECT id, name, tabname
                        FROM bayesdb_population
                        WHERE %s
                ''' % (qualifier,), params)
                pretty.pp_cursor(self.stdout, cursor)
        elif casefold(tokens[0]) == 'generator' or \
                casefold(tokens[0]) == 'generators':
            params = None
            qualifier = None
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                names = ','.join('?%d' % (i + 1,) for i in range(len(params)))
                qualifier = '''
                    (name IN ({names}))
                '''.format(names=names)
                ok = True
                for generator in params:
                    if not core.bayesdb_has_generator(
                            self._bdb, None, generator):
                        self.stdout.write('No such generator: %s\n' %
                            (repr(generator),))
                        ok = False
                if not ok:
                    return
            sql = '''
                SELECT id, name, tabname, backend
                    FROM bayesdb_generator
                    WHERE %s
            ''' % (qualifier,)
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout,
                    self._bdb.sql_execute(sql, params))
        elif casefold(tokens[0]) == 'variables':
            if len(tokens) != 2:
                self.stdout.write('Usage: .describe variables <population>\n')
                return
            population = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_population(self._bdb, population):
                    self.stdout.write('No such population: %r\n' %
                        (population,))
                    return
                population_id = core.bayesdb_get_population(
                    self._bdb, population)
                sql = '''
                    SELECT c.colno AS colno, c.name AS name,
                            v.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_population AS p,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_variable AS v
                                USING (colno))
                        WHERE p.id = ? AND p.id = v.population_id
                            AND p.tabname = c.tabname
                        ORDER BY colno ASC;
                '''
                cursor = self._bdb.sql_execute(sql, (population_id,))
                pretty.pp_cursor(self.stdout, cursor)
        elif casefold(tokens[0]) == 'model' or \
                casefold(tokens[0]) == 'models':
            if len(tokens) < 2:
                self.stdout.write('Describe models of what generator?\n')
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator(self._bdb, None, generator):
                    self.stdout.write('No such generator: %s\n' %
                        (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator(
                    self._bdb, None, generator)
                qualifier = None
                if len(tokens) == 2:
                    qualifier = '1'
                else:
                    modelnos = []
                    for token in tokens[2:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            self.stdout.write('Invalid model number: %s\n' %
                                (repr(token),))
                            return
                        else:
                            if not core.bayesdb_generator_has_model(
                                    self._bdb, generator_id, modelno):
                                self.stdout.write('No such model: %d\n' %
                                    (modelno,))
                                return
                            modelnos.append(modelno)
                    qualifier = 'modelno IN (%s)' % \
                        (','.join(map(str, modelnos),))
                sql = '''
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                ''' % (qualifier,)
                cursor = self._bdb.sql_execute(sql, (generator_id,))
                pretty.pp_cursor(self.stdout, cursor)
        else:
            self.stdout.write('Usage: .describe table(s) [<table>...]\n')
            self.stdout.write('       .describe generator(s) [<gen>...]\n')
            self.stdout.write('       .describe variables <pop>\n')
            self.stdout.write('       .describe model(s) <gen> [<model>...]\n')
Exemplo n.º 54
0
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
Exemplo n.º 55
0
def bayesdb_load_legacy_models(bdb,
                               generator,
                               table,
                               metamodel,
                               pathname,
                               create=False,
                               ifnotexists=False,
                               gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    column_stattypes = dict(
        (casefold(column_name), casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table), ))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                                 (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                cursor = bdb.sql_execute(bdb, (generator_id, ))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                                     (repr(generator), ))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg, ))
                bayesdb_create_legacy_generator(bdb, generator, table,
                                                column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                                            column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator), ))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:  # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                              (modelno, ))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                              ': no view_state in X_L[%u]' % (modelno, ))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                                  ' in view state %u of X_L[%u]' %
                                  (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                                  ': non-list for view %u columns in X_L[%u]' %
                                  (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                                      (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id, ))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(
                insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': iterations,
                })
            bdb.sql_execute(
                insert_theta_json_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'theta_json': json.dumps(theta),
                })