示例#1
0
def bdb_for_checking_cmi(backend, iterations, seed):
    """Yield a seeded in-memory bdb with an analyzed generator `m`.

    Table `t` is filled from generate_v_structured_data(1000, ...),
    population `p` types a, b and c as NOMINAL, and generator `m` is
    created with the requested backend, initialized with 10 models and
    analyzed for `iterations` iterations.

    backend: 'loom' or 'cgpm'; any other value raises ValueError.  The
        test is skipped if the loom backend cannot be imported.
    iterations: number of ANALYZE iterations.
    seed: forwarded to bayesdb_open.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:', seed=seed) as bdb:
            bdb.sql_execute('CREATE TABLE t (a, b, c)')
            records = generate_v_structured_data(1000, bdb.np_prng)
            for record in records:
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c) VALUES (?, ?, ?)
                ''', record)

            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c TO NOMINAL;
                )
            ''')
            uses_cgpm = (backend == 'cgpm')
            if uses_cgpm:
                bdb.execute('CREATE GENERATOR m FOR p using cgpm')
                # Multiprocessing is enabled only for the analysis below.
                bdb.backends['cgpm'].set_multiprocess('on')
            elif backend == 'loom':
                try:
                    from bayeslite.backends.loom_backend import LoomBackend
                except ImportError:
                    pytest.skip('Failed to import Loom.')
                loom = LoomBackend(loom_store_path=loom_store_path)
                bayesdb_register_backend(bdb, loom)
                bdb.execute('CREATE GENERATOR m FOR p using loom')
            else:
                raise ValueError('Backend %s unknown' % (backend,))
            # XXX we may want to downscale this eventually.
            bdb.execute('INITIALIZE 10 MODELS FOR m;')
            bdb.execute('ANALYZE m FOR %d ITERATIONS;' % (iterations,))
            if uses_cgpm:
                bdb.backends['cgpm'].set_multiprocess('off')
            yield bdb
示例#2
0
def smoke_loom():
    """Yield an in-memory bdb holding a one-model loom generator `m`.

    The test is skipped when the loom backend cannot be imported.
    Table `t` gets one row per element of the product of four 0/1
    columns (a, b, c, d) and one 'x'/'y' column (e).  Population `p`
    types a-d as NUMERICAL and e as NOMINAL.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            try:
                from bayeslite.backends.loom_backend import LoomBackend
            except ImportError:
                pytest.skip('Failed to import Loom.')
            loom = LoomBackend(loom_store_path=loom_store_path)
            bayesdb_register_backend(bdb, loom)
            bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')

            # Four binary domains followed by the 'x'/'y' domain.
            domains = [range(2)] * 4 + [['x', 'y']]
            for values in itertools.product(*domains):
                # XXX Insert synthetic data generator here.
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
                ''', values)

            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c, d TO NUMERICAL;
                    SET STATTYPES OF e TO NOMINAL
                )
            ''')

            for statement in ('CREATE GENERATOR m FOR p using loom;',
                              'INITIALIZE 1 MODELS FOR m;'):
                bdb.execute(statement)

            yield bdb
def test_nig_normal_latent_numbering():
    """Check variable numbering with and without a latent variable.

    g0 is a plain nig_normal generator; g1 additionally declares
    `xe deviation(x)`.  The population and g0 are expected to report
    variables [1, 2], while g1 is expected to report [-1, 1, 2].
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        insert_sql = 'insert into t(x, y) values(?, ?)'
        for x in xrange(100):
            y = x * x - 100
            bdb.sql_execute(insert_sql, (x, y))
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        manifest = [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, None) == manifest

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        # Creating the generators must not change the population's own
        # numbering; only g1 reports the extra variable -1.
        assert core.bayesdb_variable_numbers(bdb, pid, None) == manifest
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == manifest
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
def test_loom_guess_schema_nominal():
    """Check LoomBackend with a nominal variable of over 256 values.

    In this case, Loom automatically specifies the unbounded_nominal
    type.  The test inserts 300 random 20-letter words into a single
    column and only checks that model creation, analysis and teardown
    run without error.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            loom = LoomBackend(loom_store_path=loom_store_path)
            bayesdb_register_backend(bdb, loom)
            bdb.sql_execute('create table t (v)')
            # Generate every word before inserting any of them.
            words = []
            for _i in xrange(300):
                letters = []
                for _j in xrange(20):
                    position = bdb._prng.weakrandom_uniform(
                        len(string.letters))
                    letters.append(string.letters[position])
                words.append(''.join(letters))
            for word in words:
                bdb.sql_execute(
                    '''
                    insert into t (v) values (?)
                ''', (word, ))

            statements = (
                'create population p for t (v nominal)',
                'create generator g for p using loom',
                'initialize 1 model for g',
                'analyze g for 50 iterations',
                'drop models from g',
                'drop generator g',
                'drop population p',
                'drop table t',
            )
            for statement in statements:
                bdb.execute(statement)
示例#5
0
def cgpm_smoke_bdb():
    """Yield an in-memory bdb with a cgpm backend and population `p`.

    Only the 'piecewise' cgpm is registered.  Table `t` receives 27 rows
    indexed by (i, j, k) in 0..2; output, cat and input are stored as
    NULL whenever i, j or k respectively is odd.
    """
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        backend = CGPM_Backend({'piecewise': PieceWise}, multiprocess=0)
        bayesdb_register_backend(bdb, backend)

        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    # An odd index blanks out the matching column.
                    output = None if i % 2 else i + j/(k + 1)
                    if j % 2:
                        cat = None
                    elif (i + j*k) % 2:
                        cat = -1
                    else:
                        cat = +1
                    input_value = None if k % 2 else (i*j - k)**2
                    bdb.sql_execute('''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    ''', (output, cat, input_value))

        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                output  NUMERICAL;
                input   NUMERICAL;
                cat     NOMINAL;
            )
        ''')

        yield bdb
示例#6
0
def bdb_for_checking_cmi(backend, iterations, seed):
    """Build a seeded in-memory bdb with generator `m` and yield it.

    The three-column table `t` holds the rows returned by
    generate_v_structured_data(1000, bdb.np_prng); all three columns
    are NOMINAL in population `p`.  Generator `m` gets 10 models and is
    analyzed for `iterations` iterations before the bdb is yielded.

    Raises ValueError for a backend other than 'loom' or 'cgpm', and
    skips the test when loom is requested but cannot be imported.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:', seed=seed) as bdb:
            bdb.sql_execute('CREATE TABLE t (a, b, c)')
            for abc in generate_v_structured_data(1000, bdb.np_prng):
                bdb.sql_execute(
                    '''
                    INSERT INTO t (a, b, c) VALUES (?, ?, ?)
                ''', abc)

            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c TO NOMINAL;
                )
            ''')
            is_cgpm = backend == 'cgpm'
            if backend == 'loom':
                try:
                    from bayeslite.backends.loom_backend import LoomBackend
                except ImportError:
                    pytest.skip('Failed to import Loom.')
                loom_backend = LoomBackend(loom_store_path=loom_store_path)
                bayesdb_register_backend(bdb, loom_backend)
                bdb.execute('CREATE GENERATOR m FOR p using loom')
            elif is_cgpm:
                bdb.execute('CREATE GENERATOR m FOR p using cgpm')
                # Turned back off once the analysis has finished.
                bdb.backends['cgpm'].set_multiprocess('on')
            else:
                raise ValueError('Backend %s unknown' % (backend, ))
            # XXX we may want to downscale this eventually.
            bdb.execute('INITIALIZE 10 MODELS FOR m;')
            bdb.execute('ANALYZE m FOR %d ITERATIONS;' % (iterations, ))
            if is_cgpm:
                bdb.backends['cgpm'].set_multiprocess('off')
            yield bdb
示例#7
0
def test_nig_normal_latent_numbering():
    """Verify the variable numbers reported for two nig_normal generators.

    The population and the plain generator g0 should list [1, 2]; g1,
    created with `xe deviation(x)`, should list [-1, 1, 2].
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            row = (x, x*x - 100)
            bdb.sql_execute('insert into t(x, y) values(?, ?)', row)
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        # Look the generators up in creation order, checking that each
        # exists before fetching its id.
        generator_ids = {}
        for name in ('g0', 'g1'):
            assert core.bayesdb_has_generator(bdb, pid, name)
            generator_ids[name] = core.bayesdb_get_generator(bdb, pid, name)

        expected_numbers = (
            (None, [1, 2]),
            (generator_ids['g0'], [1, 2]),
            (generator_ids['g1'], [-1, 1, 2]),
        )
        for generator_id, numbers in expected_numbers:
            assert core.bayesdb_variable_numbers(
                bdb, pid, generator_id) == numbers
示例#8
0
def smoke_loom():
    """Set up and yield a small loom-backed bdb for smoke tests.

    Skips the test if LoomBackend cannot be imported.  Every
    combination of four 0/1 values and one of 'x'/'y' is inserted into
    table `t`; population `p` treats a, b, c, d as NUMERICAL and e as
    NOMINAL; generator `m` is initialized with a single model.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            try:
                from bayeslite.backends.loom_backend import LoomBackend
            except ImportError:
                pytest.skip('Failed to import Loom.')
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')

            binary = range(2)
            labels = ['x', 'y']
            combinations = itertools.product(
                binary, binary, binary, binary, labels)
            for combination in combinations:
                # XXX Insert synthetic data generator here.
                bdb.sql_execute(
                    '''
                    INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
                ''', combination)

            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c, d TO NUMERICAL;
                    SET STATTYPES OF e TO NOMINAL
                )
            ''')

            bdb.execute('CREATE GENERATOR m FOR p using loom;')
            bdb.execute('INITIALIZE 1 MODELS FOR m;')

            yield bdb
示例#9
0
def test_hackbackend():
    """Exercise backend (de)registration and generator-name uniqueness.

    Before the dotdog backend is registered, creating a generator with
    either cgpm or dotdog must raise BQLError.  After registration,
    dotdog generators can be created once per name; re-creating an
    existing name, or using cgpm, must still raise BQLError.

    The bdb is opened with a `with` block so that it is closed when the
    test ends, including when an assertion fails.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:

        def assert_rejected(bql):
            # The statement must fail with a BQLError.
            with pytest.raises(bayeslite.BQLError):
                bdb.execute(bql)

        bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
        bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
        bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
        bdb.execute('CREATE POPULATION p FOR t(b IGNORE; a NUMERICAL)')
        assert_rejected('CREATE GENERATOR p_cc FOR p USING cgpm;')
        assert_rejected('CREATE GENERATOR p_dd FOR p USING dotdog;')
        dotdog_backend = DotdogBackend()
        # Register, deregister and register again to check that a
        # backend can be re-registered after deregistration.
        bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
        bayeslite.bayesdb_deregister_backend(bdb, dotdog_backend)
        bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
        assert_rejected('CREATE GENERATOR p_cc FOR p USING cgpm;')
        bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        assert_rejected(
            'CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        assert_rejected('CREATE GENERATOR p_cc FOR p USING cgpm;')
        assert_rejected(
            'CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        # XXX Rest of test originally exercised default backend, but
        # syntax doesn't support that now.  Not clear that's wrong either.
        bdb.execute('CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
        assert_rejected(
            'CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
示例#10
0
def test_hackbackend():
    """Check which CREATE GENERATOR statements a dotdog-only bdb accepts.

    cgpm generators must be rejected with BQLError throughout.  dotdog
    generators are rejected until the backend is registered, and each
    generator name can be created only once afterwards.

    The bdb is managed by a `with` block so it is always closed when
    the test finishes.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
        bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
        bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
        bdb.execute('CREATE POPULATION p FOR t(b IGNORE; a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog;')
        dotdog_backend = DotdogBackend()
        # A deregistered backend must be registrable again.
        bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
        bayeslite.bayesdb_deregister_backend(bdb, dotdog_backend)
        bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
        bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        # p_dd now exists, so creating it again must fail.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        # XXX Rest of test originally exercised default backend, but
        # syntax doesn't support that now.  Not clear that's wrong either.
        bdb.execute('CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
示例#11
0
def test_subsample():
    """Smoke-test queries against a cgpm generator built with SUBSAMPLE.

    Two populations are guessed from the `dha` table.  `hosp_full_cc`
    uses every row, while `hosp_sub_cc` is created with SUBSAMPLE 100.
    Several ESTIMATE/INFER queries are run against `hospitals_sub` for
    rows 1 and 101, and only need to complete without error.  Finally
    the first 100 table rowids recorded in bayesdb_cgpm_individual are
    compared: they must be 1..100 for the full generator and something
    else for the subsampled one.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
            (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        # Build a real list: comparing a list with a bare range() only
        # works where range() itself returns a list.
        first_hundred = list(range(1, 100 + 1))
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == first_hundred
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
示例#12
0
def bayesdb(backend=None, **kwargs):
    """Yield an open bdb with `backend` registered; close it afterwards.

    backend: backend instance to register.  When None, a CGPM_Backend
        with an empty registry and multiprocessing disabled is used.
    kwargs: forwarded to bayeslite.bayesdb_open.
    """
    if backend is None:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
    bdb = bayeslite.bayesdb_open(builtin_backends=False, **kwargs)
    try:
        # Registration happens inside the try so that the bdb is closed
        # even if registering the backend raises.
        bayeslite.bayesdb_register_backend(bdb, backend)
        yield bdb
    finally:
        bdb.close()
示例#13
0
def bayesdb(backend=None, **kwargs):
    """Open a bdb without builtin backends, register one, and yield it.

    backend: the backend to register; defaults to a CGPM_Backend built
        with cgpm_registry={} and multiprocess=False.
    kwargs: extra keyword arguments for bayeslite.bayesdb_open.

    The bdb is closed once the consumer is done with it, and also when
    backend registration itself fails.
    """
    if backend is None:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
    bdb = bayeslite.bayesdb_open(builtin_backends=False, **kwargs)
    try:
        # Keep registration under the finally clause so a failure here
        # does not leak the open bdb.
        bayeslite.bayesdb_register_backend(bdb, backend)
        yield bdb
    finally:
        bdb.close()
def test_population_two_generators():
    """Create two loom generators, g0 and g1, for the same population.

    Only checks that both CREATE GENERATOR statements succeed.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            loom = LoomBackend(loom_store_path=loom_store_path)
            bayesdb_register_backend(bdb, loom)
            bdb.sql_execute('create table t (x)')
            for value in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (value, ))
            bdb.execute('create population p for t (x numerical)')
            for name in ('g0', 'g1'):
                bdb.execute('create generator %s for p using loom' % (name,))
def test_stattypes():
    """Smoke-test the LoomBackend on a table mixing many stattypes.

    Only checks for errors from Loom, except for the one density query
    on the boolean column, which is expected to raise.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            loom = LoomBackend(loom_store_path=loom_store_path)
            bayesdb_register_backend(bdb, loom)
            bdb.sql_execute('create table t (u, co, b, ca, cy, nu, no)')
            categories = ['a', 'b', 'c']
            draw = bdb._prng.weakrandom_uniform
            for _x in xrange(10):
                # Draw in column order so the PRNG is consumed in the
                # same sequence as the columns are listed.
                u = categories[draw(3)]
                co = draw(200)
                b = draw(2)
                ca = categories[draw(3)]
                cy = draw(1000) / 4.0
                nu = draw(1000) / 4.0 - 100.0
                no = draw(1000) / 4.0
                bdb.sql_execute(
                    '''
                    insert into t (u, co, b, ca, cy, nu, no)
                    values (?, ?, ?, ?, ?, ?, ?)''',
                    (u, co, b, ca, cy, nu, no))
            bdb.execute('''create population p for t(
                u unbounded_nominal;
                co counts;
                b boolean;
                ca nominal;
                cy cyclic;
                nu numerical;
                no nominal)
            ''')
            for setup in ('create generator g for p using loom',
                          'initialize 1 model for g',
                          'analyze g for 50 iterations'):
                bdb.execute(setup)
            bdb.execute('''estimate probability density of
                (co=2, nu=50, u='a') by p''').fetchall()
            bdb.execute('''estimate probability density of
                (nu = 50, u='a') given (co=2) by p''').fetchall()
            with pytest.raises(Exception):
                # There seems to be an issue with encoding boolean variables
                # in LoomBackend.simulate_joint, although using b=1 in the
                # condition for simulate results in no error.
                bdb.execute('''estimate probability density of
                    (b=0) by p''').fetchall()
            bdb.execute('''simulate u, co, b, ca, cy, nu, no
                from p limit 1''').fetchall()
            bdb.execute('''simulate u, b, ca, no
                from p given nu=3, co=2, b=1 limit 1''').fetchall()
            for teardown in ('drop models from g',
                             'drop generator g',
                             'drop population p',
                             'drop table t'):
                bdb.execute(teardown)
示例#16
0
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites bdb with a population and cgpm backend.

    Creates population `satellites` over `satellites_ucs`, then
    registers a CGPM_Backend with an empty registry and multiprocess=0.
    """
    population_bql = '''
            create population satellites for satellites_ucs with schema(
                apogee numerical;
                class_of_orbit nominal;
                country_of_operator nominal;
                launch_mass numerical;
                perigee numerical;
                period numerical
            )
        '''
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(population_bql)
        bayesdb_register_backend(bdb, CGPM_Backend({}, multiprocess=0))
        yield bdb
示例#17
0
def test_nig_normal_smoke():
    """Run a nig_normal generator through its whole life cycle.

    Creates, initializes and analyzes a generator over a one-column
    table, runs one density estimate and one simulation, then drops
    everything.  Only checks that no statement raises.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        insert_sql = 'insert into t(x) values(?)'
        for value in xrange(100):
            bdb.sql_execute(insert_sql, (value, ))
        for setup in ('create population p for t(x numerical)',
                      'create generator g for p using nig_normal',
                      'initialize 1 model for g',
                      'analyze g for 1 iteration'):
            bdb.execute(setup)
        # Fetch the results so the queries are actually evaluated.
        for query in ('estimate probability density of x = 50 from p',
                      'simulate x from p limit 1'):
            bdb.execute(query).fetchall()
        for teardown in ('drop models from g',
                         'drop generator g',
                         'drop population p',
                         'drop table t'):
            bdb.execute(teardown)
示例#18
0
def test_nig_normal_smoke():
    """Smoke test: build, query and tear down a nig_normal generator.

    No results are inspected; the test passes when every statement
    executes without raising.
    """
    statements = (
        # (BQL text, whether to fetch all rows of the result)
        ('create population p for t(x numerical)', False),
        ('create generator g for p using nig_normal', False),
        ('initialize 1 model for g', False),
        ('analyze g for 1 iteration', False),
        ('estimate probability density of x = 50 from p', True),
        ('simulate x from p limit 1', True),
        ('drop models from g', False),
        ('drop generator g', False),
        ('drop population p', False),
        ('drop table t', False),
    )
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x,))
        for bql, fetch in statements:
            cursor = bdb.execute(bql)
            if fetch:
                cursor.fetchall()
示例#19
0
def test_stattypes():
    """Run the LoomBackend over a table with one column per stattype.

    Only checks for errors from Loom.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (u, co, b, ca, cy, nu, no)')
            labels = ['a', 'b', 'c']
            prng = bdb._prng
            for _x in xrange(10):
                # The draws are made one per column, left to right.
                u_value = labels[prng.weakrandom_uniform(3)]
                co_value = prng.weakrandom_uniform(200)
                b_value = prng.weakrandom_uniform(2)
                ca_value = labels[prng.weakrandom_uniform(3)]
                cy_value = prng.weakrandom_uniform(1000) / 4.0
                nu_value = prng.weakrandom_uniform(1000) / 4.0 - 100.0
                no_value = prng.weakrandom_uniform(1000) / 4.0
                row = (u_value, co_value, b_value, ca_value,
                       cy_value, nu_value, no_value)
                bdb.sql_execute(
                    '''
                    insert into t (u, co, b, ca, cy, nu, no)
                    values (?, ?, ?, ?, ?, ?, ?)''',
                    row)
            bdb.execute('''create population p for t(
                u unbounded_nominal;
                co counts;
                b boolean;
                ca nominal;
                cy cyclic;
                nu numerical;
                no nominal)
            ''')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            density = bdb.execute('''estimate probability density of
                nu = 50, u='a' from p''')
            density.fetchall()
            simulation = bdb.execute('''simulate u, co, b, ca, cy, nu, no
                from p limit 1''')
            simulation.fetchall()
            for teardown in ('drop models from g',
                             'drop generator g',
                             'drop population p',
                             'drop table t'):
                bdb.execute(teardown)
示例#20
0
 def register(self, line, cell=None):
     """Register the backend named on `line` with self._bdb.

     line: '<backend> [args ...]', split with shlex.  Only the 'loom'
         backend is supported; its first argument is the loom store
         path.
     cell: unused.

     Raises ValueError for an unknown backend, when the loom backend
     cannot be imported, or when no path is given for loom.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('backend', help='Name of backend to register.')
     parser.add_argument(
         'args',
         type=str,
         default=[],
         nargs='*',
         help='List of arguments to provide the initialization.')
     parsed = parser.parse_args(shlex.split(line))
     if args_is_unknown_backend(parsed.backend) if False else \
             parsed.backend != 'loom':
         raise ValueError('Unknown backend: %s' % (parsed.backend, ))
     try:
         from bayeslite.backends.loom_backend import LoomBackend
     except ImportError:
         raise ValueError('Failed to import loom backend.')
     if not parsed.args:
         raise ValueError('Specify <path> for loom.')
     store_path = parsed.args[0]
     backend = LoomBackend(loom_store_path=store_path)
     bayesdb_register_backend(self._bdb, backend)
示例#21
0
def get_bdb(cfg, logger):
    """Open the bdb file named by cfg.bdb_file and return it.

    When cfg.backend is 'loom', the backend object from
    get_backend_object(cfg) is registered, every row of
    bayesdb_loom_generator is pointed at cfg.loom_path, and
    cfg.population_name is analyzed for one iteration.

    cfg: object with bdb_file, backend, loom_path and population_name.
    logger: object with an info(message) method.
    """
    bdb_file = cfg.bdb_file
    logger.info("Using bdb file: {}".format(bdb_file))

    bdb = bayeslite.bayesdb_open(pathname=bdb_file)

    if cfg.backend == 'loom':
        backend = get_backend_object(cfg)
        bayeslite.bayesdb_register_backend(bdb, backend)
        # These are hacks that are necessary because bayeslite currently
        # assumes that `.bdb` file creation and querying will happen in the
        # same Python process.
        path_message = (
            'Backend is set to {}. Manually setting loom_store_path to {}'.
            format(cfg.backend, cfg.loom_path))
        logger.info(path_message)
        bdb.sql_execute(
            'UPDATE bayesdb_loom_generator SET loom_store_path = ?',
            (cfg.loom_path, ))
        analyze_message = (
            'Backend is set to {}. Analyzing for 1 iterations.'.format(
                cfg.backend))
        logger.info(analyze_message)
        analyze_bql = 'ANALYZE {} FOR 1 ITERATIONS;'.format(
            cfg.population_name)
        bdb.execute(analyze_bql)

    logger.info("Backend registered")

    return bdb
def test_loom_one_numeric():
    """Simple test of the LoomBackend on a one variable table.

    Only checks for errors from the Loom system: a generator is
    created, initialized and analyzed over ten numerical values,
    queried once for a density and once for a simulation, then dropped
    along with its population and table.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t(x)')
            for x in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (x, ))
            bdb.execute('create population p for t (x numerical)')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 models for g')
            bdb.execute('analyze g for 10 iterations')
            # fetchall() forces the queries to be evaluated.
            bdb.execute('''
                    estimate probability density of x = 50 from p
            ''').fetchall()
            bdb.execute('simulate x from p limit 1').fetchall()
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
示例#23
0
def _retest_example(bdb, exname):
    """Re-register an example's backend on `bdb` and re-run ANALYZE.

    Looks `exname` up in `examples`, registers a fresh backend
    instance, asserts that the example's table, generator and models 0
    and 1 exist, and analyzes the generator.

    Per-model ANALYZE may raise BQLError; this is accepted only for the
    'loom' example.
    """
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qg = bql_quote_name(g)

    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)
    p_id = core.bayesdb_get_population(bdb, p)

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)

    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg,))
    try:
        # Test analyzing models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg,))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg,))
    except bayeslite.BQLError:
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
示例#24
0
def _retest_example(bdb, exname):
    """Re-register an example's backend on `bdb` and re-run ANALYZE.

    The entry for `exname` in `examples` supplies the backend factory
    and the table, population and generator names.  The table,
    generator and models 0 and 1 are asserted to exist before the
    generator is analyzed.

    Per-model ANALYZE may raise BQLError; this is accepted only for the
    'loom' example.
    """
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
     cleanup) = examples[exname]
    qg = bql_quote_name(g)

    # Build the backend once and register that same instance.
    metamodel = mm()
    bayeslite.bayesdb_register_backend(bdb, metamodel)
    p_id = core.bayesdb_get_population(bdb, p)

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)

    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg, ))
    try:
        # Test analyzing models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg, ))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg, ))
    except bayeslite.BQLError:
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
示例#25
0
    def bayesdb(self, line, cell=None):
        """Open the bdb file named on `line` and register a cgpm backend.

        line: 'path [-s SEED] [-j]', split on whitespace.  `-s` is an
            integer seed (default 0) and `-j` turns multiprocessing on.
        cell: unused.

        Any bdb already held in self._bdb is closed first.  Returns the
        string 'Loaded: <path>'.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('path', help='Path of bdb file.')
        parser.add_argument('-s', type=int, default=0, help='Seed.')
        parser.add_argument('-j', action='store_true', help='Multiprocessing.')
        options = parser.parse_args(line.split())

        previous = self._bdb
        if previous is not None:
            previous.close()
            self._bdb = None

        self._path = options.path
        # The integer seed fills the last of four little-endian 64-bit
        # words; the first three are zero.
        packed_seed = struct.pack('<QQQQ', 0, 0, 0, options.s)
        self._bdb = bayesdb_open(
            pathname=options.path, seed=packed_seed, builtin_backends=False)

        # Small hack for the VsCGpm, which takes in the venturescript source
        # from %venturescript cells!
        def _VsCGpm(outputs, inputs, rng, *args, **kwds):
            if 'source' not in kwds:
                kwds['source'] = '\n'.join(self._venturescript)
            return VsCGpm(outputs, inputs, rng, *args, **kwds)

        # Register cgpm backend.
        registry = {
            'factor_analysis': FactorAnalysis,
            'inline_venturescript': InlineVsCGpm,
            'linear_regression': LinearRegression,
            'multivariate_kde': MultivariateKde,
            'multivariate_knn': MultivariateKnn,
            'ordinary_least_squares': OrdinaryLeastSquares,
            'random_forest': RandomForest,
            'venturescript': _VsCGpm,
        }
        backend = CGPM_Backend(registry, multiprocess=options.j)
        bayesdb_register_backend(self._bdb, backend)
        return 'Loaded: %s' % (self._path,)
示例#26
0
def test_bad_analyze_vars():
    """ANALYZE must reject unknown names in VARIABLES and SKIP clauses.

    After a plain ANALYZE (with and without an empty option list)
    succeeds, naming the nonexistent variable `perige` in either clause
    is expected to raise BQLError.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        cgpm_registry = {'kepler': Kepler, 'linreg': LinearRegression}
        backend = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, backend)
        bdb.execute('''
            CREATE GENERATOR satellites_cgpm FOR satellites USING cgpm
        ''')
        for statement in ('INITIALIZE 1 MODEL FOR satellites_cgpm',
                          'ANALYZE satellites_cgpm FOR 1 ITERATION ()',
                          'ANALYZE satellites_cgpm FOR 1 ITERATION'):
            bdb.execute(statement)
        with pytest.raises(BQLError):
            # Unknown variable `perige'.
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION (
                    VARIABLES period, perige
                )
            ''')
        with pytest.raises(BQLError):
            # Unknown variable `perige'.
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION (
                    SKIP period, perige
                )
            ''')
示例#27
0
def run(stdin, stdout, stderr, argv):
    """Start a bayeslite shell with a CGPM backend registered.

    Parameters:
        stdin, stdout, stderr: streams handed to the shell; usage errors
            are written to `stderr`.
        argv: full argument vector; argv[0] is the program name and the
            remainder is passed to `parse_args`.

    Returns:
        1 if no database path was given without -m/--memory, or if the path
        is '-'; 0 otherwise.
    """
    args = parse_args(argv[1:])

    # Keep only the last path component of the program name.  rfind()
    # returns -1 when there is no slash, so slicing from `slash + 1` is
    # right in every case, including a slash at index 0 (e.g. '/prog'),
    # which a truthiness test on the index would skip.
    progname = argv[0]
    progname = progname[progname.rfind('/') + 1:]

    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
        builtin_backends=False)

    # Any job count other than 1 turns multiprocessing on.
    multiprocess = args.jobs != 1
    backend = CGPM_Backend(cgpm_registry={}, multiprocess=multiprocess)
    bayeslite.bayesdb_register_backend(bdb, backend)
    bdbshell = shell.Shell(bdb, 'cgpm', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.expanduser('~/.bayesliterc')
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    # Only the remaining files are skipped: the shell still
                    # starts below (unless --batch) and the exit status
                    # stays 0.
                    bdbshell.stdout.write('%s is not a file.  Aborting.\n' %
                        (str(path),))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
示例#28
0
def test_nig_normal_latent_conditional_smoke():
    """Smoke-test conditional densities mixing x with the latent xe.

    Generator g1 declares `xe deviation(x)`; g0 does not.  Queries that
    mention xe are expected to raise BQLError unless modeled by g1.
    """
    with bayesdb_open(':memory:') as bdb:

        def accepted(bql):
            # The query must execute and be fully fetched without error.
            bdb.execute(bql).fetchall()

        def rejected(bql):
            # Executing or fetching the query must raise BQLError.
            with pytest.raises(BQLError):
                bdb.execute(bql).fetchall()

        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for value in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (value,))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for bql in (
            'initialize 1 model for g0',
            'analyze g0 for 1 iteration',
            'initialize 1 model for g1',
            'analyze g1 for 1 iteration',
        ):
            bdb.execute(bql)

        # observed given observed
        accepted('''
            estimate probability density of x = 50 given (x = 50) within p
        ''')
        accepted('''
            estimate probability density of x = 50 given (x = 50) within p
                modeled by g0
        ''')
        accepted('''
            estimate probability density of x = 50 given (x = 50) within p
                modeled by g1
        ''')

        # observed given latent
        rejected('''
                estimate probability density of x = 50 given (xe = 50) within p
            ''')
        rejected('''
                estimate probability density of x = 50 given (xe = 50) within p
                    modeled by g0
            ''')
        accepted('''
            estimate probability density of x = 50 given (xe = 50) within p
                modeled by g1
        ''')

        # latent given observed
        rejected('''
                estimate probability density of xe = 50 given (x = 50) within p
            ''')
        rejected('''
                estimate probability density of xe = 50 given (x = 50) within p
                    modeled by g0
            ''')
        accepted('''
            estimate probability density of xe = 50 given (x = 50) within p
                modeled by g1
        ''')

        # Tear everything down again, models before their generators.
        for bql in (
            'drop models from g0',
            'drop generator g0',
            'drop models from g1',
            'drop generator g1',
            'drop population p',
            'drop table t',
        ):
            bdb.execute(bql)
def register_loom(bdb):
    """Register a LoomBackend on `bdb`.

    The backend's store path is whatever `temp_file_path('.bdb')` returns.
    """
    backend = LoomBackend(loom_store_path=temp_file_path('.bdb'))
    bayeslite.bayesdb_register_backend(bdb, backend)
示例#30
0
def test_nig_normal_latent_2var_smoke():
    """Smoke-test pairwise estimates and simulation with a latent variable.

    The population has manifest variables x and y; generator g1 also
    declares the latent `xe deviation(x)`, while g0 does not.
    """
    with bayesdb_open(':memory:') as bdb:

        def row_count(bql):
            # Number of rows produced by running `bql` to completion.
            return len(bdb.execute(bql).fetchall())

        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for value in xrange(100):
            row = (value, value * value - 100)
            bdb.sql_execute('insert into t(x, y) values(?, ?)', row)
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        assert row_count('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''') == 4

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for bql in (
            'initialize 1 model for g0',
            'analyze g0 for 1 iteration',
            'initialize 1 model for g1',
            'analyze g1 for 1 iteration',
        ):
            bdb.execute(bql)

        # CORRELATION, CORRELATION PVALUE, with generators.
        assert row_count('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''') == 4
        assert row_count('''
            estimate correlation, correlation pvalue
                from pairwise variables of p modeled by g0
        ''') == 4
        with pytest.raises(BQLError):
            # g1 has a latent variable xe.
            assert row_count('''
                estimate correlation, correlation pvalue
                    from pairwise variables of p modeled by g1
            ''') == 4

        # DEPENDENCE PROBABILITY, MUTUAL INFORMATION
        assert row_count('''
            estimate dependence probability, mutual information
                from pairwise variables of p
        ''') == 4
        assert row_count('''
            estimate dependence probability, mutual information
                from pairwise variables of p modeled by g0
        ''') == 4
        # With g1 the latent xe joins x and y, giving 3 x 3 pairs.
        assert row_count('''
            estimate dependence probability, mutual information
                from pairwise variables of p modeled by g1
        ''') == 9

        # SIMULATE LATENT VARIABLE
        assert row_count('''
            simulate xe from p modeled by g1 limit 10;
        ''') == 10
        assert row_count('''
            simulate y, xe from p modeled by g1 limit 10;
        ''') == 10
        # Cannot simulate the latent xe from the population p.
        with pytest.raises(BQLError):
            assert row_count('''
                simulate xe from p limit 10;
            ''') == 10
        # Cannot simulate the latent xe from the generator g0.
        with pytest.raises(BQLError):
            assert row_count('''
                simulate xe from p modeled by g0 limit 10;
            ''') == 10

        # Tear everything down again, models before their generators.
        for bql in (
            'drop models from g0',
            'drop generator g0',
            'drop models from g1',
            'drop generator g1',
            'drop population p',
            'drop table t',
        ):
            bdb.execute(bql)
示例#31
0
def _test_example(bdb, exname):
    """Exercise the table/population/generator/model lifecycle for a backend.

    Looks up `exname` in the module-level `examples` table, registers the
    backend it names on `bdb`, and then checks creation, savepoint
    rollback, and dropping of the table, generator, and models.

    Parameters:
        bdb: open BayesDB handle to run the scenario against.
        exname: key into `examples`; also compared against 'loom' when
            DROP MODEL with an explicit model number is rejected.
    """
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id+1, g)
    # Creating the same generator a second time must fail and leave the
    # existing one in place.
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    # Initialize models, rolling back the first two attempts.
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
        # The full list does not depend on i, so check it once; list()
        # keeps the comparison valid where range() is not a list.
        assert list(range(10)) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError:
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
示例#32
0
def _test_example(bdb, exname):
    """Run the create/rollback/drop scenario for the example `exname`.

    The entry `examples[exname]` supplies the backend factory, the SQL and
    BQL statements, and the data rows.  The backend is registered on `bdb`
    and the table, population, generator, and models are then created and
    dropped, with savepoints checked along the way.

    Parameters:
        bdb: open BayesDB handle to run the scenario against.
        exname: key into `examples`; must be 'loom' if DROP MODEL with an
            explicit model number raises BQLError.
    """
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
     cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    metamodel = mm()
    bayeslite.bayesdb_register_backend(bdb, metamodel)

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id + 1, g)
    # A duplicate CREATE must fail without disturbing the generator.
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    # Initialize models; the first two attempts are rolled back.
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg, ))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
        # Loop-invariant, so checked once after the loop; list() keeps
        # the comparison meaningful where range() is not a list.
        assert list(range(10)) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg, ))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg, ))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError:
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
示例#33
0
def test_nig_normal_latent_2var_conditional_smoke():
    """Smoke-test conditional ESTIMATE/SIMULATE over x, y and latent xe.

    Generator g1 declares `xe deviation(x)`; g0 does not.  Any query that
    mentions xe is expected to raise BQLError unless modeled by g1.
    """
    with bayesdb_open(':memory:') as bdb:

        def accepted(bql):
            # The query must execute and be fully fetched without error.
            bdb.execute(bql).fetchall()

        def rejected(bql):
            # Executing or fetching the query must raise BQLError.
            with pytest.raises(BQLError):
                bdb.execute(bql).fetchall()

        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for value in xrange(100):
            row = (value, value*value - 100)
            bdb.sql_execute('insert into t(x, y) values(?, ?)', row)
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        pairwise = bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall()
        assert 4 == len(pairwise)

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for bql in (
            'initialize 1 model for g0',
            'analyze g0 for 1 iteration',
            'initialize 1 model for g1',
            'analyze g1 for 1 iteration',
        ):
            bdb.execute(bql)

        # observed given other observed
        accepted('''
            estimate probability density of x = 50 given (y = 49) within p
        ''')
        accepted('''
            estimate probability density of x = 50 given (y = 49) within p
                modeled by g0
        ''')
        accepted('''
            estimate probability density of x = 50 given (y = 49) within p
                modeled by g1
        ''')
        accepted('simulate x from p given y = 49 limit 1')
        accepted('''
            simulate x from p modeled by g0 given y = 49 limit 1
        ''')
        accepted('''
            simulate x from p modeled by g1 given y = 49 limit 1
        ''')

        # observed given related latent
        rejected('''
                estimate probability density of x = 50 given (xe = 1) within p
            ''')
        rejected('''
                estimate probability density of x = 50 given (xe = 1) within p
                    modeled by g0
            ''')
        accepted('''
            estimate probability density of x = 50 given (xe = 1) within p
                modeled by g1
        ''')
        rejected('simulate x from p given xe = 1 limit 1')
        rejected('''
                simulate x from p modeled by g0 given xe = 1 limit 1
            ''')
        accepted('''
            simulate x from p modeled by g1 given xe = 1 limit 1
        ''')

        # observed given unrelated latent
        rejected('''
                estimate probability density of y = 50 given (xe = 1) within p
            ''')
        rejected('''
                estimate probability density of y = 50 given (xe = 1) within p
                    modeled by g0
            ''')
        accepted('''
            estimate probability density of y = 50 given (xe = 1) within p
                modeled by g1
        ''')
        rejected('simulate y from p given xe = 1 limit 1')
        rejected('''
                simulate y from p modeled by g0 given xe = 1 limit 1
            ''')
        accepted('''
            simulate y from p modeled by g1 given xe = 1 limit 1
        ''')

        # latent given related observed
        rejected('''
                estimate probability density of xe = 1 given (x = 50) within p
            ''')
        rejected('''
                estimate probability density of xe = 1 given (x = 50) within p
                    modeled by g0
            ''')
        accepted('''
            estimate probability density of xe = 1 given (x = 50) within p
                modeled by g1
        ''')
        rejected('simulate xe from p given x = 50 limit 1')
        rejected('''
                simulate xe from p modeled by g0 given x = 50 limit 1
            ''')
        accepted('''
            simulate xe from p modeled by g1 given x = 50 limit 1
        ''')

        # latent given unrelated observed
        rejected('''
                estimate probability density of xe = 1 given (y = 50) within p
            ''')
        rejected('''
                estimate probability density of xe = 1 given (y = 50) within p
                    modeled by g0
            ''')
        accepted('''
            estimate probability density of xe = 1 given (y = 50) within p
                modeled by g1
        ''')
        rejected('simulate xe from p given y = 50 limit 1')
        rejected('''
                simulate xe from p modeled by g0 given y = 50 limit 1
            ''')
        accepted('''
            simulate xe from p modeled by g1 given y = 50 limit 1
        ''')

        # Tear everything down again, models before their generators.
        for bql in (
            'drop models from g0',
            'drop generator g0',
            'drop models from g1',
            'drop generator g1',
            'drop population p',
            'drop table t',
        ):
            bdb.execute(bql)
示例#34
0
def test_nig_normal_latent_2var2lat_conditional_smoke():
    """Smoke-test queries relating the two latents xe and ye.

    Generator g1 declares `xe deviation(x)` and `ye deviation(y)`; g0
    declares neither.  Every query here mentions both latents and is
    expected to raise BQLError unless modeled by g1.
    """
    with bayesdb_open(':memory:') as bdb:

        def accepted(bql, fetch=True):
            # Execute `bql`; drain the cursor only when `fetch` is set.
            cursor = bdb.execute(bql)
            if fetch:
                cursor.fetchall()

        def rejected(bql, fetch=True):
            # Same as accepted(), but a BQLError must be raised.
            with pytest.raises(BQLError):
                accepted(bql, fetch)

        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for value in xrange(100):
            row = (value, value*value - 100)
            bdb.sql_execute('insert into t(x, y) values(?, ?)', row)
        bdb.execute('create population p for t(x numerical; y numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(
                xe deviation(x),
                ye deviation(y)
            )
        ''')
        for bql in (
            'initialize 1 model for g0',
            'analyze g0 for 1 iteration',
            'initialize 1 model for g1',
            'analyze g1 for 1 iteration',
        ):
            bdb.execute(bql)

        # latent given latent
        rejected('''
                estimate probability density of xe = 1 given (ye = -1) within p
            ''')
        rejected('''
                estimate probability density of xe = 1 given (ye = -1) within p
                     modeled by g0
            ''')
        accepted('''
            estimate probability density of xe = 1 given (ye = -1) within p
                 modeled by g1
        ''')

        rejected('''
                simulate xe from p given ye = -1 limit 1
            ''')
        rejected('''
                simulate xe from p modeled by g0 given ye = -1 limit 1
            ''')
        accepted('''
            simulate xe from p modeled by g1 given ye = -1 limit 1
        ''')

        # The remaining statements are only executed, never fetched.
        rejected(
            'estimate dependence probability of xe with ye within p',
            fetch=False)
        rejected('''
                estimate dependence probability of xe with ye within p
                    modeled by g0
            ''', fetch=False)
        accepted('''
            estimate dependence probability of xe with ye within p
                modeled by g1
        ''', fetch=False)

        rejected(
            'estimate mutual information of xe with ye within p',
            fetch=False)
        rejected('''
                estimate mutual information of xe with ye within p
                    modeled by g0
            ''', fetch=False)
        accepted('''
            estimate mutual information of xe with ye within p
                modeled by g1
        ''', fetch=False)

        # Tear everything down again, models before their generators.
        for bql in (
            'drop models from g0',
            'drop generator g0',
            'drop models from g1',
            'drop generator g1',
            'drop population p',
            'drop table t',
        ):
            bdb.execute(bql)
示例#35
0
def test_initialize_with_all_nulls():
    """Check INITIALIZE against a column whose values are all NULL.

    Column a of table t is NULL in every row.  Initializing models must
    raise BQLError while a is a modeled (manifest) variable, and must
    succeed once a is ignored or its model is overridden.
    """
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        backend = CGPM_Backend({'barebones': BareBonesCGpm}, multiprocess=0)
        bayesdb_register_backend(bdb, backend)

        # Create table with all missing values for a.
        bdb.sql_execute('''
            CREATE TABLE t (a REAL, b REAL, c REAL);
        ''')
        for row in (
            (None, None, 3),
            (None, None, 1),
            (None, None, 1),
            (None, -2, 1),
            (None, -5, 1),
            (None, 2, 3),
        ):
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', row)

        # Fail when a is numerical and modeled by crosscat.
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                SET STATTYPES OF a, b, c TO NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR p;
        ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                INITIALIZE 2 MODELS FOR m;
            ''')

        # Fail when a is nominal and modeled by crosscat.
        bdb.execute('''
            CREATE POPULATION p2 FOR t WITH SCHEMA(
                SET STATTYPES OF a TO NOMINAL;
                SET STATTYPES OF b, c TO NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m2 FOR p2;')
        with pytest.raises(BQLError):
            bdb.execute('INITIALIZE 2 MODELS FOR m2;')

        # Succeed when a is ignored.
        bdb.execute('''
            CREATE POPULATION p3 FOR t WITH SCHEMA(
                IGNORE a;
                SET STATTYPES OF b, c TO NUMERICAL
            )
        ''')
        for bql in (
            'CREATE GENERATOR m3 FOR p3;',
            'INITIALIZE 2 MODELS FOR m3;',
        ):
            bdb.execute(bql)

        # Succeed when a is numerical but overridden using a dummy CGPM.
        bdb.execute('''
            CREATE GENERATOR m4 FOR p(
                OVERRIDE MODEL FOR a GIVEN b USING barebones
            )
        ''')
        for bql in (
            'INITIALIZE 2 MODELS FOR m4;',
            'ANALYZE m4 FOR 1 ITERATION',
        ):
            bdb.execute(bql)
示例#36
0
def test_output_stattypes():
    """Check stattype handling for factor_analysis overrides.

    Creates the satellites population, registers a CGPM backend whose
    registry contains only factor_analysis, and then checks which
    generator definitions are rejected at CREATE time (BQLError), which
    fail at INITIALIZE time (ValueError), and that an all-numerical
    definition can be initialized, analyzed, and queried.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        # Missing policy for class_of_orbit, perigee, period
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    SET STATTYPES OF apogee, launch_mass TO NUMERICAL;
                    SET STATTYPES OF country_of_operator TO NOMINAL
                )
            ''')
        # Same schema, with the three remaining columns explicitly ignored.
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                SET STATTYPES OF apogee, launch_mass TO NUMERICAL;
                SET STATTYPES OF country_of_operator TO NOMINAL
            )
        ''')
        registry = {
            'factor_analysis': FactorAnalysis,
        }
        bayesdb_register_backend(bdb, CGPM_Backend(registry))
        # Factor analysis over a nominal manifest (country_of_operator):
        # the generator can be created, but initializing it raises
        # ValueError.
        bdb.execute('''
            CREATE GENERATOR satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')
        with pytest.raises(BQLError):
            # Duplicate pc_2 in LATENT and EXPOSE.
            bdb.execute('''
                CREATE GENERATOR satellites_g1 FOR satellites(
                    LATENT pc_2 NOMINAL,
                    OVERRIDE GENERATIVE MODEL FOR
                        apogee, launch_mass
                    AND EXPOSE pc_2 NOMINAL
                    USING factor_analysis(L=1)
                )
            ''')
        # Factor analysis exposing a nominal latent (pc_2): again the
        # generator can be created, but initializing it raises ValueError.
        bdb.execute('''
            CREATE GENERATOR satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR
                    apogee, launch_mass
                AND EXPOSE pc_2 NOMINAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')
        # Creating factor analysis with all numerical should be ok.
        # Note that pc_4 is declared LATENT after the OVERRIDE clause that
        # already refers to it.
        bdb.execute('''
            CREATE GENERATOR satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;

                OVERRIDE MODEL FOR apogee, launch_mass, pc_3, pc_4
                USING factor_analysis(L=2);

                LATENT pc_4 NUMERICAL
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_g2')
        bdb.execute('ANALYZE satellites_g2 FOR 2 ITERATION')
        # Cannot transition baseline and foreign using timed analysis.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_g2 FOR 2 SECONDS (
                    VARIABLES country_of_operator, apogee, launch_mass, pc_3);
            ''')
        # Iteration-based analysis restricted to the overridden manifest
        # variables is accepted.
        bdb.execute('''
            ANALYZE satellites_g2 FOR 1 ITERATION (
                VARIABLES apogee, launch_mass);
        ''')
        # Dependence probability of manifest with latent.
        cursor = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF apogee WITH pc_3
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert cursor[0][0] == 1.
        # Dependence probability of latent with latent.
        cursor = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF pc_3 WITH pc_4
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert cursor[0][0] == 1.
        # Mutual information of latent with manifest.  The result is fetched
        # but not checked: this only verifies that the query runs.
        cursor = bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF apogee WITH pc_4 USING 1 SAMPLES
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        # Mutual information of latent with latent.  Likewise unchecked.
        cursor = bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF pc_3 WITH pc_4 USING 1 SAMPLES
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
示例#37
0
def test_cgpm_extravaganza__ci_slow():
    """Smoke-test a cgpm generator mixing latent variables and foreign models.

    Builds a synthetic ``satellites_ucs`` table, declares a population over
    it, checks that malformed generator schemas raise ``BQLError``, and then
    runs ANALYZE / ESTIMATE / INFER / SIMULATE queries against a generator
    that overrides models with venturescript, linreg and forest.  Most
    queries are only executed and fetched; their values are not inspected.

    The test is skipped when the optional cgpm model imports fail.
    """
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        # pytest.skip raises, so no explicit return is needed after it.
        pytest.skip('no sklearn or venturescript')
    # Loop-invariant pool of operators for the synthetic rows.
    countries = ['US', 'Russia', 'China', 'Bulgaria']
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # Each orbit class gets its own function mapping (apogee, perigee)
        # to period; apogee ranges over x and perigee over y.
        for orbit_class, period_fn in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    # Keep the order of the PRNG draws (country, name, mass)
                    # so the generated data stays the same for a given seed.
                    country = countries[bdb._np_prng.randint(0, len(countries))]
                    name = 'sat-%s-%d' % (
                        country, bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, orbit_class, x, y,
                          period_fn(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        # Runs before any backend is registered.
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        # Misspelled variable name (apoge) in a category model clause.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Misspelled variable name (apoge) in a GIVEN clause.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # LATENT declared with the name of an existing population variable.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        # Without a generator only the six modeled columns are numbered; with
        # g0 two negative variable numbers appear as well (presumably the two
        # LATENT variables declared above).
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        # The queries below are executed only to check that they run.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        # Tear everything down in dependency order.
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
示例#38
0
def test_regress_bonanza__ci_integration():
    """Run REGRESS queries and check the names of the returned coefficients.

    For several choices of GIVEN variables the test verifies that every
    result row names either an expected numerical regressor (or the
    intercept) or a dummy-coded level of an expected nominal regressor.  It
    also checks two failure modes: mixing ``*`` with explicit variables
    raises ``BQLError`` and regressing on a single nominal variable with one
    sample raises ``ValueError``.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee                  NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                launch_mass             NUMERICAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL;
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR satellites;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            """Assert each result row names a distinct, expected variable.

            Every row must have exactly two entries.  The first entry is
            the variable name, which must not repeat and must either appear
            in `numericals` or start with ``<nominal>_dum_`` for some entry
            of `nominals`.
            """
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.
        # Bind the result so the check below validates this query and not
        # the previous one.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        # The query has no `?` placeholder, so no bindings are passed.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELED BY m USING MODEL 1;
        ''', (3,)).fetchall()

        # Re-run the subexpression on its own to learn which variables the
        # regression above should have used.
        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
                ORDER BY dependence probability with apogee DESC
                LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
示例#39
0
def test_cgpm_extravaganza__ci_slow():
    """Smoke-test a cgpm generator mixing latent variables and foreign models.

    Builds a synthetic ``satellites_ucs`` table, declares a population over
    it, checks that malformed generator schemas raise ``BQLError``, and then
    runs ANALYZE / ESTIMATE / INFER / SIMULATE queries against a generator
    that overrides models with venturescript, linreg and forest.  Most
    queries are only executed and fetched; their values are not inspected.

    The test is skipped when the optional cgpm model imports fail.
    """
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        # pytest.skip raises, so no explicit return is needed after it.
        pytest.skip('no sklearn or venturescript')
    # Loop-invariant pool of operators for the synthetic rows.
    countries = ['US', 'Russia', 'China', 'Bulgaria']
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # Each orbit class gets its own function mapping (apogee, perigee)
        # to period; apogee ranges over x and perigee over y.
        for orbit_class, period_fn in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    # Keep the order of the PRNG draws (country, name, mass)
                    # so the generated data stays the same for a given seed.
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                                          bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, orbit_class, x, y,
                          period_fn(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        # Runs before any backend is registered.
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        # Misspelled variable name (apoge) in a category model clause.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Misspelled variable name (apoge) in a GIVEN clause.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # LATENT declared with the name of an existing population variable.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        # Without a generator only the six modeled columns are numbered; with
        # g0 two negative variable numbers appear as well (presumably the two
        # LATENT variables declared above).
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        # The queries below are executed only to check that they run.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        # Tear everything down in dependency order.
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
示例#40
0
def test_predictive_relevance():
    """Exercise ESTIMATE PREDICTIVE RELEVANCE in its FROM and BY forms.

    Covers self-relevance of existing rows, queries that combine existing
    and hypothetical rows, bound parameters, the error cases that must
    raise ``BQLError``, and the results obtained when a row was inserted
    after the models were analyzed.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(bdb, CGPM_Backend(cgpm_registry=dict()))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA (
                apogee                  NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                launch_mass             NUMERICAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR satellites;')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        bdb.execute('ANALYZE m FOR 25 ITERATION (OPTIMIZED);')

        # Check self-similarites, and also provide coverage of bindings.
        rowids = bdb.execute('SELECT OID from satellites_ucs;').fetchall()
        for rowid in rowids[:4]:
            # Each fetched row is a 1-tuple; bind its rowid on both sides so
            # that every one of the first four rows is compared to itself.
            cursor = bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    TO EXISTING ROWS (rowid = ?)
                    IN THE CONTEXT OF "period"
                FROM satellites
                WHERE rowid = ?
            ''', (rowid[0], rowid[0],))
            assert next(cursor)[0] == 1.

        # A full extravaganza query, using FROM (as a 1-row).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (perigee=1.0, launch_mass=120),
                    (country_of_operator='Bulgaria', perigee=2.0))
                IN THE CONTEXT OF "country_of_operator"
            FROM satellites
            LIMIT 5
        ''').fetchall()
        assert len(cursor) == 5
        assert all(0 <= c[0] <= 1 for c in cursor)

        # A full extravaganza query, using BY (as a constant).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "country_of_operator"
            BY satellites
        ''').fetchall()
        assert len(cursor) == 1
        assert all(0 <= c[0] <= 1 for c in cursor)

        # Hypothetical satellite with negative perigee should not be similar,
        # and use a binding to just ensure that they work.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (perigee = ?))
                IN THE CONTEXT OF "perigee"
            FROM satellites
            LIMIT 5
        ''', (-10000,)).fetchall()
        assert len(cursor) == 5
        assert all(np.allclose(c[0], 0) for c in cursor)

        # No matching target OF row.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid < 0) TO EXISTING ROWS (rowid = 10)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown CONTEXT variable "banana".
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 1) TO EXISTING ROWS (rowid = 2)
                    IN THE CONTEXT OF "banana"
                BY satellites
            ''')

        # No matching EXISTING ROW.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10) TO EXISTING ROWS (rowid < 0)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown nominal values 'Mongolia' in HYPOTHETICAL ROWS.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10)
                    TO HYPOTHETICAL ROWS WITH VALUES (
                        (country_of_operator='Mongolia'),
                        (country_of_operator='Bulgaria', perigee=2.0))
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Create a new row.  It is inserted after ANALYZE, so the models
        # have not incorporated it.
        bdb.sql_execute('''
            INSERT INTO satellites_ucs
            (apogee, launch_mass) VALUES (12.128, 12.128)
        ''')

        # TARGET ROW not yet incorporated should yield no value (None).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (apogee = 12.128)
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # EXISTING ROW not yet incorporated should yield no value (None),
        # since there is no hypothetical.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other EXISTING ROWS with apogee > 0, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other HYPOTHETICAL ROWS, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None
示例#41
0
def test_analysis_subproblems_basic():
    """Run ANALYZE with SUBPROBLEM(S), VARIABLES and ROWS clauses on cgpm.

    Each named subproblem is analyzed with and without OPTIMIZED, then the
    variable-hyperparameters subproblem (which must fail under OPTIMIZED),
    and finally ROWS restricted to rows inside and outside the generator's
    subsample.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, backend)
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 MODELS FOR g0')

        # Test each subproblem individually except for variable hyperparameters.
        optimized_clauses = ['', 'OPTIMIZED;']
        subproblems = [
            'variable clustering',
            'variable clustering concentration',
            'row clustering',
            'row clustering concentration',
        ]
        # Same order as nested loops: all subproblems without OPTIMIZED
        # first, then all of them with it.
        combinations = [
            (clause, name)
            for clause in optimized_clauses
            for name in subproblems
        ]
        for clause, name in combinations:
            bdb.execute('''
                    ANALYZE g0 MODELS 0,1 FOR 4 ITERATION(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (name, clause))

        # Test variable hyperparameters.
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        # OPTIMIZED backend does not support variable hyperparameters.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # Test rows.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        individuals = bdb.execute('''
            SELECT table_rowid FROM  bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, ))
        # Table rowids recorded for this generator in
        # bayesdb_cgpm_individual.
        subsample_rows = []
        for record in individuals:
            subsample_rows.append(record[0])
        # Candidates 0..19 that are not among the recorded rowids.
        bad_rows = []
        for candidate in xrange(20):
            if candidate not in subsample_rows:
                bad_rows.append(candidate)
        good_csv = ','.join(str(r) for r in subsample_rows)
        bad_csv = ','.join(str(r) for r in bad_rows)
        for clause in optimized_clauses:
            bdb.execute('''
                ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
            )
            ''' % (good_csv, clause))
            # Fail on rows not in the population or subsample.
            with pytest.raises(BQLError):
                bdb.execute('''
                    ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                )
                ''' % (bad_csv, clause))
示例#42
0
def test_nig_normal_latent_2var_smoke():
    """Smoke-test two-variable queries on nig_normal generators.

    Two generators are built over the same population: ``g0`` without and
    ``g1`` with the extra variable ``xe`` (declared as ``xe deviation(x)``).
    The test checks the number of rows returned by pairwise estimates and
    simulations, and that queries that reference ``xe`` where it is not
    available raise ``BQLError``.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x*x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        # Two variables give 2 x 2 = 4 pairwise rows.
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')

        # CORRELATION, CORRELATION PVALUE, with generators.
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p modeled by g0
        ''').fetchall())
        # g1 has a latent variable xe.  The query is expected to raise, so
        # no assertion is wrapped around it; fetchall() is kept so an error
        # raised while consuming the cursor is caught too.
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate correlation, correlation pvalue
                    from pairwise variables of p modeled by g1
            ''').fetchall()

        # DEPENDENCE PROBABILITY, MUTUAL INFORMATION
        assert 4 == len(bdb.execute('''
            estimate dependence probability, mutual information
                from pairwise variables of p
        ''').fetchall())
        assert 4 == len(bdb.execute('''
            estimate dependence probability, mutual information
                from pairwise variables of p modeled by g0
        ''').fetchall())
        # With xe included there are three variables, so 3 x 3 = 9 rows.
        assert 9 == len(bdb.execute('''
            estimate dependence probability, mutual information
                from pairwise variables of p modeled by g1
        ''').fetchall())

        # SIMULATE LATENT VARIABLE
        assert 10 == len(bdb.execute('''
            simulate xe from p modeled by g1 limit 10;
        ''').fetchall())
        assert 10 == len(bdb.execute('''
            simulate y, xe from p modeled by g1 limit 10;
        ''').fetchall())
        # Cannot simulate the latent xe from the population p.
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p limit 10;
            ''').fetchall()
        # Cannot simulate the latent xe from the generator g0.
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 limit 10;
            ''').fetchall()

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_loom_four_var():
    """Test Loom on a four variable table.

    Table consists of two fixed rows plus 100 random rows with columns:
    * x - drawn with weakrandom_uniform(X_MAX)
    * y - drawn with weakrandom_uniform(Y_MAX), truncated to an int
    * xx - just 2*x
    * z - a nominal variable that is 'a' or 'b' depending on a
    weakrandom_uniform(2) draw

    Queries run and tested include:
    estimate predictive relevance, estimate similarity,
    estimate probability density, simulate,
    estimate mutual information, estimate dependence probability,
    infer explicit predict
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t(x, xx, y, z)')
            # Two identical rows, so that rowid 1 has an exact duplicate.
            bdb.sql_execute('''
                insert into t (x, xx, y, z) values (100, 200, 50, 'a')''')
            bdb.sql_execute('''
                insert into t (x, xx, y, z) values (100, 200, 50, 'a')''')
            for _index in xrange(100):
                x = bdb._prng.weakrandom_uniform(X_MAX)
                bdb.sql_execute(
                    '''
                    insert into t(x, xx, y, z) values(?, ?, ?, ?)
                    ''',
                    (x, x * 2, int(bdb._prng.weakrandom_uniform(Y_MAX)),
                     'a' if bdb._prng.weakrandom_uniform(2) == 1 else 'b'))

            bdb.execute('''
                create population p for t(x numerical; xx numerical;
                y numerical; z nominal)''')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 10 model for g')
            bdb.execute('analyze g for 20 iterations')

            # Predictive relevance to hypothetical rows is expected to be
            # rejected with a BQLError.
            with pytest.raises(BQLError):
                bdb.execute('''
                    estimate
                        predictive relevance
                            to hypothetical rows with values ((x=50, xx=100))
                            in the context of "x"
                    from p
                    where rowid = 1
                ''').fetchall()

            # Relevance of row 1 to itself must be exactly 1.
            relevance = bdb.execute('''
                estimate
                    predictive relevance
                        to existing rows (rowid = 1)
                        in the context of "x"
                from p
                where rowid = 1
            ''').fetchall()
            assert relevance[0][0] == 1

            similarities = bdb.execute('''estimate similarity
                in the context of x from pairwise p limit 2''').fetchall()
            assert similarities[0][2] <= 1
            assert similarities[1][2] <= 1
            assert abs(similarities[0][2] - similarities[1][2]) < 0.005

            # A value far above X_MAX should have negligible density.
            impossible_density = bdb.execute(
                'estimate probability density of x = %d by p' %
                (X_MAX * 2.5, )).fetchall()
            assert impossible_density[0][0] < 0.0001

            # The midpoint of the x range should have non-negligible density.
            possible_density = bdb.execute(
                'estimate probability density of x = %d  by p' %
                ((X_MAX - X_MIN) / 2, )).fetchall()
            assert possible_density[0][0] > 0.001

            nominal_density = bdb.execute('''
                estimate probability density of z = 'a' by p
            ''').fetchall()
            assert abs(nominal_density[0][0] - .5) < 0.2

            # Rows unpack as (ignored, column, column, mutual information).
            mutual_info = bdb.execute('''
                estimate mutual information as mutinf
                from pairwise columns of p order by mutinf
            ''').fetchall()
            _, a, b, c = zip(*mutual_info)
            mutual_info_dict = dict(zip(zip(a, b), c))
            assert mutual_info_dict[('x', 'y')] < mutual_info_dict[(
                'x', 'xx')] < mutual_info_dict[('x', 'x')]

            simulated_data = bdb.execute('simulate x, y from p limit %d' %
                                         (PREDICT_RUNS, )).fetchall()
            xs, ys = zip(*simulated_data)
            # Sample means should land near the middle of each range.
            assert abs((sum(xs)/len(xs)) - (X_MAX-X_MIN)/2) < \
                    (X_MAX-X_MIN)/5
            assert abs((sum(ys)/len(ys)) - (Y_MAX-Y_MIN)/2) < \
                    (Y_MAX-Y_MIN)/5
            # Fewer than half of the simulated values may fall outside the
            # range of their own variable: x is bounded by X_MIN/X_MAX and
            # y by Y_MIN/Y_MAX.
            assert sum(1 for x in xs if x < X_MIN or x > X_MAX) \
                < .5 * PREDICT_RUNS
            assert sum(1 for y in ys if y < Y_MIN or y > Y_MAX) \
                < .5 * PREDICT_RUNS

            dependence = bdb.execute('''estimate dependence probability
                from pairwise variables of p''').fetchall()
            for (_, col1, col2, d_val) in dependence:
                if col1 == col2:
                    assert d_val == 1
                elif col1 in ['xx', 'x'] and col2 in ['xx', 'x']:
                    # x and xx are deterministically related.
                    assert d_val > 0.80
                else:
                    assert d_val < 0.20

            predict_confidence = bdb.execute(
                'infer explicit predict x confidence x_c FROM p').fetchall()
            predictions, confidences = zip(*predict_confidence)
            assert abs((sum(predictions) / len(predictions)) -
                       (X_MAX - X_MIN) / 2) < (X_MAX - X_MIN) / 5
            # NOTE(review): the threshold uses PREDICT_RUNS although there is
            # one prediction per table row -- confirm this is intended.
            assert sum(
                1 for p in predictions
                if p < X_MIN or p > X_MAX) < .5 * PREDICT_RUNS
            assert all(c == 0 for c in confidences)
示例#44
0
def test_using_modelnos():
    """Smoke-test USING MODEL(S) clauses and per-model ANALYZE on cgpm."""
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee              TO NUMERICAL;
                SET STATTYPE OF class_of_orbit      TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass         TO NUMERICAL;
                SET STATTYPE OF perigee             TO NUMERICAL;
                SET STATTYPE OF period              TO NUMERICAL
            )
        ''')
        cgpm_backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, cgpm_backend)
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR g0')

        # SIMULATE restricted to a range of models must not crash.
        bdb.execute('''
            SIMULATE apogee, class_of_orbit
            FROM satellites
            MODELED BY g0
            USING MODEL 0-1
            LIMIT 10
        ''')
        # INFER EXPLICIT restricted to a single model must not crash.
        bdb.execute('''
            INFER EXPLICIT PREDICT period, perigee
            FROM satellites
            MODELED BY g0
            USING MODEL 0
            LIMIT 2
        ''')
        # With a single model the BY-form dependence probability is 0 or 1.
        depprob_cursor = bdb.execute('''
            ESTIMATE
                DEPENDENCE PROBABILITY OF launch_mass WITH period
            BY satellites
            MODELED BY g0
            USING MODEL 0
        ''')
        assert cursor_value(depprob_cursor) in [0, 1]
        # The same holds for every row of the pairwise form.
        pairwise_cursor = bdb.execute('''
            ESTIMATE
                DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
            MODELED BY g0
            USING MODEL 1
        ''')
        for row in pairwise_cursor:
            assert row[0] in [0, 1]
        # One-row mutual information must not crash.
        bdb.execute('''
            ESTIMATE
                MUTUAL INFORMATION WITH (period) USING 1 SAMPLES
            FROM VARIABLES OF satellites
            USING MODEL 0
        ''').fetchall()

        # ANALYZE on a per-model basis: only the named model should advance.
        bdb.execute('''
            ANALYZE g0 MODEL 0 FOR 1 ITERATION CHECKPOINT 1 ITERATION
        ''')
        # NOTE(review): the literal 1 is presumably the generator id of g0
        # -- confirm against CGPM_Backend._engine.
        engine = bdb.backends['cgpm']._engine(bdb, 1)

        def logscore_lengths():
            # Number of recorded logscore diagnostics for models 0 and 1.
            return [
                len(engine.states[modelno].diagnostics['logscore'])
                for modelno in (0, 1)
            ]

        assert logscore_lengths() == [1, 0]
        bdb.execute('''
            ANALYZE g0 MODEL 1 FOR 4 ITERATION CHECKPOINT 1 ITERATION (
                OPTIMIZED
            );
        ''')
        assert logscore_lengths() == [1, 4]

        # Model numbers that do not exist must be rejected.
        bad_modelno_queries = (
            '''
                ANALYZE g0 MODEL 0-3 FOR 4 ITERATION
            ''',
            '''
                SIMULATE apogee FROM satellites USING MODEL 25 LIMIT 10;
            ''',
            '''
                ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
                USING MODELS 0-8 LIMIT 2;
            ''',
        )
        for bad_query in bad_modelno_queries:
            with pytest.raises(BQLError):
                bdb.execute(bad_query)
示例#45
0
def test_nig_normal_latent_2var2lat_conditional_smoke():
    """Smoke-test queries relating two latent variables, xe and ye.

    Generator g1 declares the latents xe and ye; generator g0 does not.
    Each query is expected to raise BQLError against the bare population and
    against g0, and to run against g1.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for value in xrange(100):
            row = (value, value * value - 100)
            bdb.sql_execute('insert into t(x, y) values(?, ?)', row)
        bdb.execute('create population p for t(x numerical; y numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(
                xe deviation(x),
                ye deviation(y)
            )
        ''')
        for statement in (
                'initialize 1 model for g0',
                'analyze g0 for 1 iteration',
                'initialize 1 model for g1',
                'analyze g1 for 1 iteration'):
            bdb.execute(statement)

        def assert_rejected(query, fetch):
            # Require BQLError from executing (and optionally draining) query.
            with pytest.raises(BQLError):
                cursor = bdb.execute(query)
                if fetch:
                    cursor.fetchall()

        # Probability density of a latent given a latent.
        assert_rejected('''
                estimate probability density of xe = 1 given (ye = -1) within p
            ''', True)
        assert_rejected('''
                estimate probability density of xe = 1 given (ye = -1) within p
                     modeled by g0
            ''', True)
        bdb.execute('''
            estimate probability density of xe = 1 given (ye = -1) within p
                 modeled by g1
        ''').fetchall()

        # Simulate a latent given a latent.
        assert_rejected('''
                simulate xe from p given ye = -1 limit 1
            ''', True)
        assert_rejected('''
                simulate xe from p modeled by g0 given ye = -1 limit 1
            ''', True)
        bdb.execute('''
            simulate xe from p modeled by g1 given ye = -1 limit 1
        ''').fetchall()

        # Dependence probability between the latents (cursor not drained).
        assert_rejected(
            'estimate dependence probability of xe with ye within p', False)
        assert_rejected('''
                estimate dependence probability of xe with ye within p
                    modeled by g0
            ''', False)
        bdb.execute('''
            estimate dependence probability of xe with ye within p
                modeled by g1
        ''')

        # Mutual information between the latents (cursor not drained).
        assert_rejected(
            'estimate mutual information of xe with ye within p', False)
        assert_rejected('''
                estimate mutual information of xe with ye within p
                    modeled by g0
            ''', False)
        bdb.execute('''
            estimate mutual information of xe with ye within p
                modeled by g1
        ''')

        # Tear everything down in dependency order.
        for statement in (
                'drop models from g0',
                'drop generator g0',
                'drop models from g1',
                'drop generator g1',
                'drop population p',
                'drop table t'):
            bdb.execute(statement)
示例#46
0
def test_add_drop_models():
    """Check the modelno -> cgpm_modelno mapping across add/drop cycles.

    The mapping is read from the bayesdb_cgpm_modelno table after each
    INITIALIZE / DROP MODELS statement and compared with the expected one.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, backend)
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES OF (*);
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR p (SUBSAMPLE 10);')

        # Identifiers needed to query the mapping table directly.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def check_modelno_mapping(expected):
            # Every stored (modelno, cgpm_modelno) row must match an entry of
            # `expected`; matched entries are removed so that leftovers (or
            # unexpected rows, via KeyError) are detected.
            rows = bdb.sql_execute('''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id,))
            for row in rows:
                modelno, cgpm_modelno = row[0], row[1]
                assert expected[modelno] == cgpm_modelno
                del expected[modelno]
            assert len(expected) == 0

        def merged(*parts):
            # Fresh dict combining the given partial mappings.
            combined = {}
            for part in parts:
                combined.update(part)
            return combined

        # A fresh batch of models maps each modelno to itself.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        check_modelno_mapping(dict((n, n) for n in xrange(16)))

        bdb.execute('ANALYZE m FOR 1 ITERATION (QUIET);')

        # After dropping models the cgpm numbering is contiguous while the
        # bayesdb numbering keeps its gaps; relative order is preserved.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        survivors = {0: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 13: 7, 15: 8}
        check_modelno_mapping(merged(survivors))

        # More analysis on the surviving models.
        bdb.execute('ANALYZE m FOR 1 ITERATION (OPTIMIZED; QUIET);')

        # INITIALIZE 14 MODELS IF NOT EXISTS only fills in the missing
        # modelnos among 0-13; it does not bring the total to 14, and
        # modelno 15 is left untouched. The cgpm numbers then span 0-14
        # while the bayesdb numbers span 0-15 without 14.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        recreated = {1: 9, 8: 10, 9: 11, 10: 12, 11: 13, 12: 14}
        check_modelno_mapping(merged(survivors, recreated))

        # Drop two more models, then add them back together with new ones,
        # confirming the arithmetic and ordering stay correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        after_second_drop = {
            2: 0, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 13: 6, 15: 7,
            # Models recreated by the earlier INITIALIZE.
            8: 8, 9: 9, 10: 10, 11: 11, 12: 12,
        }
        check_modelno_mapping(merged(after_second_drop))

        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        rerecreated = {0: 13, 1: 14}
        brand_new = {14: 15, 16: 16, 17: 17, 18: 18, 19: 19}
        check_modelno_mapping(
            merged(after_second_drop, rerecreated, brand_new))

        # Dropping a range of models that do not exist is an error.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop everything.
        bdb.execute('DROP MODELS FROM m;')
        # Nothing is left to drop.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # The mapping table holds no rows for this generator any more.
        count_cursor = bdb.sql_execute('''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id,))
        assert cursor_value(count_cursor) == 0
示例#47
0
def test_analysis_subproblems_basic():
    """Run ANALYZE with SUBPROBLEM(S), VARIABLES and ROWS options on cgpm."""
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        cgpm_backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, cgpm_backend)
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 MODELS FOR g0')

        # Every subproblem except variable hyperparameters is exercised on
        # its own, both without and with the OPTIMIZED option.
        analysis_modes = ('', 'OPTIMIZED;')
        single_subproblems = (
            'variable clustering',
            'variable clustering concentration',
            'row clustering',
            'row clustering concentration',
        )
        for mode in analysis_modes:
            for name in single_subproblems:
                bdb.execute('''
                    ANALYZE g0 MODELS 0,1 FOR 4 ITERATION(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (name, mode))

        # Variable hyperparameters run without OPTIMIZED ...
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        # ... but are rejected in combination with OPTIMIZED.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # ROWS: collect the table rowids recorded for the generator, and the
        # rowids below 20 that are not among them.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        rowid_cursor = bdb.execute('''
            SELECT table_rowid FROM  bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id,))
        subsample_rowids = [record[0] for record in rowid_cursor]
        other_rowids = [
            rowid for rowid in xrange(20) if rowid not in subsample_rowids
        ]
        sampled_csv = ','.join(map(str, subsample_rowids))
        unsampled_csv = ','.join(map(str, other_rowids))
        for mode in analysis_modes:
            # Rows from the subsample are accepted.
            bdb.execute('''
                ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
            )
            ''' % (sampled_csv, mode))
            # Rows outside the population or the subsample are rejected.
            with pytest.raises(BQLError):
                bdb.execute('''
                    ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                )
                ''' % (unsampled_csv, mode))
示例#48
0
def test_nig_normal_latent_smoke():
    """Smoke-test queries on x and on the latent variable xe.

    Generator g1 declares the latent xe; generator g0 does not. Queries that
    mention xe are expected to raise BQLError against the bare population
    and against g0, and to run against g1.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for value in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (value,))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for statement in (
                'initialize 1 model for g0',
                'analyze g0 for 1 iteration',
                'initialize 1 model for g1',
                'analyze g1 for 1 iteration'):
            bdb.execute(statement)

        def assert_rejected(query):
            # Executing and draining the query must raise BQLError.
            with pytest.raises(BQLError):
                bdb.execute(query).fetchall()

        # PROBABILITY DENSITY OF x = v
        bdb.execute(
            'estimate probability density of x = 50 within p').fetchall()
        assert_rejected('estimate probability density of xe = 1 within p')
        assert_rejected('''
                estimate probability density of xe = 1 within p modeled by g0
            ''')
        bdb.execute('''
            estimate probability density of xe = 1 within p modeled by g1
        ''').fetchall()

        # PREDICTIVE PROBABILITY OF x
        bdb.execute('estimate predictive probability of x from p').fetchall()
        assert_rejected('estimate predictive probability of xe from p')
        assert_rejected('''
                estimate predictive probability of xe from p modeled by g0
            ''')
        # Under g1 the predictive probability of xe is None for every row.
        latent_cursor = bdb.execute('''
            estimate rowid, predictive probability of xe from p modeled by g1
        ''')
        for rowid, p_xe in latent_cursor:
            assert p_xe is None, 'rowid %r p(xe) %r' % (rowid, p_xe)

        # INFER/PREDICT
        bdb.execute(
            'INFER EXPLICIT PREDICT x CONFIDENCE x_c FROM p').fetchall()
        assert_rejected('INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p')
        assert_rejected('''
                INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p
                    MODELED BY g0
            ''')
        bdb.execute('''
            INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p
                MODELED BY g1
        ''').fetchall()

        # SIMULATE x
        bdb.execute('simulate x from p limit 1').fetchall()
        assert_rejected('simulate x, xe from p limit 1')
        assert_rejected('simulate x, xe from p modeled by g0 limit 1')
        bdb.execute('simulate x, xe from p modeled by g1 limit 1').fetchall()

        # SIMILARITY in the context of x
        population_pairs = bdb.execute('''
            estimate similarity in the context of x from pairwise p limit 100
        ''').fetchall()
        assert 100 == len(population_pairs)
        g0_pairs = bdb.execute('''
            estimate similarity in the context of x
            from pairwise p modeled by g0 limit 1
        ''').fetchall()
        assert 1 == len(g0_pairs)
        # No such column xe in g0.
        with pytest.raises(BQLError):
            assert 1 == len(bdb.execute('''
                estimate similarity in the context of xe
                    from pairwise p modeled by g0 limit 1
            ''').fetchall())
        # Column xe exists in g1.
        g1_pairs = bdb.execute('''
            estimate similarity in the context of xe
                from pairwise p modeled by g1 limit 1
        ''').fetchall()
        assert 1 == len(g1_pairs)

        # Tear everything down in dependency order.
        for statement in (
                'drop models from g0',
                'drop generator g0',
                'drop models from g1',
                'drop generator g1',
                'drop population p',
                'drop table t'):
            bdb.execute(statement)
def test_loom_complex_add_analyze_drop_sequence():
    """Check Loom model bookkeeping across initialize/analyze/drop cycles."""
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            loom_backend = LoomBackend(loom_store_path=loom_store_path)
            bayesdb_register_backend(bdb, loom_backend)
            bdb.sql_execute('create table t (x)')
            for value in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (value, ))
            bdb.execute('create population p for t (x numerical)')
            bdb.execute('create generator g for p using loom')

            num_models_sql = '''
                SELECT num_models FROM bayesdb_loom_generator_model_info
                    WHERE generator_id=?;
            '''
            density_bql = '''
                estimate probability density of x = 50 from p'''

            def recorded_num_models():
                # Model count stored for generator g; the ids are looked up
                # afresh on every call.
                pop_id = bayesdb_get_population(bdb, 'p')
                gen_id = bayesdb_get_generator(bdb, pop_id, 'g')
                rows = bdb.sql_execute(num_models_sql, (gen_id, )).fetchall()
                return rows[0][0]

            def densities_at_50():
                # First column of every row of the density estimate.
                rows = bdb.execute(density_bql).fetchall()
                return [row[0] for row in rows]

            bdb.execute('initialize 2 models for g')
            bdb.execute('initialize 3 models if not exists for g')
            # The total must be 3 models, not 2 + 3 = 5.
            assert recorded_num_models() == 3

            bdb.execute('analyze g for 10 iterations')
            bdb.execute('estimate probability density of x = 50 from p')

            # Dropping a single model is expected to raise BQLError here;
            # dropping all of them works.
            with pytest.raises(BQLError):
                bdb.execute('drop model 1 from g')
            bdb.execute('drop models from g')

            bdb.execute('initialize 1 models for g')
            # The model count must have been reset by the drop.
            assert recorded_num_models() == 1
            bdb.execute('analyze g for 50 iterations')

            first_densities = densities_at_50()
            bdb.execute('simulate x from p limit 1').fetchall()
            bdb.execute('drop models from g')

            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            second_densities = densities_at_50()
            # Analysis should have started fresh after the drop and should
            # give similar results the second time round.
            for before, after in zip(first_densities, second_densities):
                assert abs(before - after) < .01

            for statement in (
                    'drop models from g',
                    'drop generator g',
                    'drop population p',
                    'drop table t'):
                bdb.execute(statement)
def register_loom(bdb):
    """Register a LoomBackend on bdb.

    The backend's loom_store_path is whatever temp_file_path('.bdb')
    returns.
    """
    backend = LoomBackend(loom_store_path=temp_file_path('.bdb'))
    bayeslite.bayesdb_register_backend(bdb, backend)
示例#51
0
def test_unknown_stattype():
    """Check how populations and cgpm generators treat a made-up stattype.

    The QUAGGA statistical type is rejected until it is inserted into
    bayesdb_stattype; after that a population can use it, but a cgpm
    generator only accepts it with an explicit category model.
    """
    from cgpm.regressions.linreg import LinearRegression
    with cgpm_dummy_satellites_bdb() as bdb:
        # New column relaunches, filled with apogee + perigee row by row.
        bdb.sql_execute('ALTER TABLE satellites_ucs ADD COLUMN relaunches')
        count_cursor = bdb.sql_execute('''
            SELECT COUNT(*) FROM satellites_ucs
        ''')
        row_count = count_cursor.next()[0]
        for rowid in xrange(1, row_count + 1):
            bdb.sql_execute('''
                UPDATE satellites_ucs
                    SET relaunches = (SELECT apogee + perigee)
                    WHERE _rowid_ = ?
            ''', (rowid,))

        # QUAGGA is not a known statistical type yet, so the schema fails.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    SET STATTYPES OF apogee, perigee, launch_mass, period
                        TO NUMERICAL;

                    SET STATTYPE OF class_of_orbit, country_of_operator
                        TO NOMINAL;

                    SET STATTYPE OF relaunches
                        TO QUAGGA
                )
            ''')

        # Register the statistical type by hand; the schema is now accepted.
        bdb.sql_execute('INSERT INTO bayesdb_stattype VALUES (?)', ('quagga',))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPES OF apogee, perigee, launch_mass, period
                    TO NUMERICAL;

                SET STATTYPES OF class_of_orbit, country_of_operator
                TO NOMINAL;

                SET STATTYPES OF relaunches
                TO QUAGGA
            )
        ''')

        foreign_models = dict(kepler=Kepler, linreg=LinearRegression)
        bayesdb_register_backend(bdb, CGPM_Backend(foreign_models))

        # A QUAGGA variable cannot be modeled by default.
        with pytest.raises(BQLError):
            bdb.execute('CREATE GENERATOR g0 FOR satellites USING cgpm')
        # Nor can it be used as an input of an overriding model.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR relaunches GIVEN apogee USING linreg;
                    OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
                )
            ''')

        # With an explicit distribution family QUAGGA can be modeled ...
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON
            )
        ''')
        # ... and can then also serve as an input of an overriding model.
        bdb.execute('''
            CREATE GENERATOR g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON;
                OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
            )
        ''')
示例#52
0
def test_nig_normal_latent_smoke():
    """Smoke-test BQL queries involving the latent variable xe.

    xe is declared only by generator g1. Queries naming xe must raise
    BQLError when run against population p alone or against generator g0,
    and must run when modeled by g1.
    """
    with bayesdb_open(':memory:') as bdb:
        backend = NIGNormalBackend()
        bayesdb_register_backend(bdb, backend)
        bdb.sql_execute('create table t(x)')
        for number in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (number, ))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')

        # PROBABILITY DENSITY OF x = v
        cursor = bdb.execute('estimate probability density of x = 50 within p')
        cursor.fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute(
                'estimate probability density of xe = 1 within p')
            cursor.fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute('''
                estimate probability density of xe = 1 within p modeled by g0
            ''')
            cursor.fetchall()
        cursor = bdb.execute('''
            estimate probability density of xe = 1 within p modeled by g1
        ''')
        cursor.fetchall()

        # PREDICTIVE PROBABILITY OF x
        cursor = bdb.execute('estimate predictive probability of x from p')
        cursor.fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute(
                'estimate predictive probability of xe from p')
            cursor.fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute('''
                estimate predictive probability of xe from p modeled by g0
            ''')
            cursor.fetchall()
        # Modeled by g1, the predictive probability of xe is None per row.
        for rowid, latent_prob in bdb.execute('''
            estimate rowid, predictive probability of xe from p modeled by g1
        '''):
            assert latent_prob is None, \
                'rowid %r p(xe) %r' % (rowid, latent_prob)

        # INFER/PREDICT
        cursor = bdb.execute('INFER EXPLICIT PREDICT x CONFIDENCE x_c FROM p')
        cursor.fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute(
                'INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p')
            cursor.fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute('''
                INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p
                    MODELED BY g0
            ''')
            cursor.fetchall()
        cursor = bdb.execute('''
            INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p
                MODELED BY g1
        ''')
        cursor.fetchall()

        # SIMULATE x
        bdb.execute('simulate x from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x, xe from p limit 1').fetchall()
        with pytest.raises(BQLError):
            cursor = bdb.execute('simulate x, xe from p modeled by g0 limit 1')
            cursor.fetchall()
        bdb.execute('simulate x, xe from p modeled by g1 limit 1').fetchall()

        # SIMILARITY
        rows = bdb.execute('''
            estimate similarity in the context of x from pairwise p limit 100
        ''').fetchall()
        assert 100 == len(rows)
        rows = bdb.execute('''
            estimate similarity in the context of x
            from pairwise p modeled by g0 limit 1
        ''').fetchall()
        assert 1 == len(rows)
        # No such column xe in g0.
        with pytest.raises(BQLError):
            rows = bdb.execute('''
                estimate similarity in the context of xe
                    from pairwise p modeled by g0 limit 1
            ''').fetchall()
            assert 1 == len(rows)
        # Column xe exists in g1.
        rows = bdb.execute('''
            estimate similarity in the context of xe
                from pairwise p modeled by g1 limit 1
        ''').fetchall()
        assert 1 == len(rows)

        # Clean up: models first, then generators, population and table.
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
示例#53
0
def test_cgpm_kepler():
    """Exercise the cgpm backend with overridden models on satellites data.

    Two generators are built over the same population, each overriding the
    model for ``period``: g0 with ``linreg`` and g1 with ``kepler``.  The
    test asserts that creating g1 grows ``bayesdb_cgpm_individual`` by
    exactly 20 rows, runs a series of ANALYZE variants (some of which must
    be rejected), and finishes with ESTIMATE, SIMULATE and INFER EXPLICIT
    queries whose result shapes and types are checked.
    """
    count_individuals = 'SELECT COUNT(*) FROM bayesdb_cgpm_individual'
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee                  NUMERICAL;
                launch_mass             NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL
            )
        ''')
        # Issued before any generator has been created.
        bdb.execute('''
            ESTIMATE CORRELATION from PAIRWISE VARIABLES OF satellites
        ''').fetchall()

        # Register a cgpm backend that knows the two model names used below.
        model_registry = dict(kepler=Kepler, linreg=LinearRegression)
        backend = CGPM_Backend(model_registry, multiprocess=0)
        bayesdb_register_backend(bdb, backend)

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                OVERRIDE GENERATIVE MODEL FOR period
                GIVEN apogee, perigee
                USING linreg
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR g0')
        rows_before = bdb.execute(count_individuals).fetchvalue()

        # Another generator: exponential launch mass instead of normal.
        bdb.execute('''
            CREATE GENERATOR g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR launch_mass TO EXPONENTIAL;
                OVERRIDE MODEL FOR period GIVEN apogee, perigee
                    USING kepler(quagga = eland);
                SUBSAMPLE 20
            )
        ''')
        rows_after = bdb.execute(count_individuals).fetchvalue()
        assert rows_after - rows_before == 20

        bdb.execute('INITIALIZE 1 MODEL IF NOT EXISTS FOR g1')
        for analysis in (
                'ANALYZE g0 FOR 1 ITERATION',
                'ANALYZE g0 FOR 1 ITERATION (VARIABLES period)',
                'ANALYZE g1 FOR 1 ITERATION',
                'ANALYZE g1 FOR 1 ITERATION (VARIABLES period)',
        ):
            bdb.execute(analysis)
        # OPTIMIZED is ignored because period is a foreign variable.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION (OPTIMIZED; VARIABLES period)
        ''')
        # This should fail since we have a SET CATEGORY MODEL which is not
        # compatible with lovecat. The ValueError is from cgpm not bayeslite.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION
                    (OPTIMIZED; VARIABLES launch_mass)
            ''')
        # Cannot use timed analysis with mixed variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS (VARIABLES period, apogee)
            ''')
        # Cannot use timed analysis with mixed variables (period by SKIP).
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS (SKIP apogee)
            ''')
        # OK to use iteration analysis with mixed values.
        bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION (VARIABLES period, apogee)
            ''')

        # Queries below only need to run to completion.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE apogee, perigee, period FROM satellites LIMIT 100
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                PREDICT apogee
                    CONFIDENCE apogee_confidence
                    USING 5 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()

        # With CONFIDENCE: each row is (predicted value, confidence).
        with_confidence = bdb.execute('''
            INFER EXPLICIT
                PREDICT class_of_orbit
                    CONFIDENCE class_of_orbit_confidence
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = with_confidence[0]
        assert len(first_row) == 2
        assert isinstance(first_row[0], unicode)
        assert isinstance(first_row[1], float)

        # No CONFIDENCE specified.
        without_confidence = bdb.execute('''
            INFER EXPLICIT PREDICT class_of_orbit USING 2 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = without_confidence[0]
        assert len(first_row) == 1
        assert isinstance(first_row[0], unicode)

        for teardown in (
                'DROP MODELS FROM g0',
                'DROP GENERATOR g0',
                'DROP GENERATOR g1',
        ):
            bdb.execute(teardown)
示例#54
0
def test_nig_normal_latent_2var_conditional_smoke():
    """Smoke-test conditional queries mixing observed and latent variables.

    The table has observed columns ``x`` and ``y``.  Generator g0 is a plain
    ``nig_normal`` generator; g1 additionally declares ``xe deviation(x)``.
    Every density/simulate query that mentions ``xe`` is expected to raise
    ``BQLError`` unless it is explicitly ``modeled by g1``; queries that
    only mention ``x`` and ``y`` must succeed with either generator or none.
    """
    with bayesdb_open(':memory:') as bdb:
        def run(bql):
            """Execute *bql* and return every row it yields."""
            return bdb.execute(bql).fetchall()

        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for value in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (value, value * value - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        correlations = run('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''')
        assert 4 == len(correlations)

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for statement in (
                'initialize 1 model for g0',
                'analyze g0 for 1 iteration',
                'initialize 1 model for g1',
                'analyze g1 for 1 iteration',
        ):
            bdb.execute(statement)

        # observed given other observed
        run('''
            estimate probability density of x = 50 given (y = 49) within p
        ''')
        run('''
            estimate probability density of x = 50 given (y = 49) within p
                modeled by g0
        ''')
        run('''
            estimate probability density of x = 50 given (y = 49) within p
                modeled by g1
        ''')
        run('simulate x from p given y = 49 limit 1')
        run('''
            simulate x from p modeled by g0 given y = 49 limit 1
        ''')
        run('''
            simulate x from p modeled by g1 given y = 49 limit 1
        ''')

        # observed given related latent
        with pytest.raises(BQLError):
            run('''
                estimate probability density of x = 50 given (xe = 1) within p
            ''')
        with pytest.raises(BQLError):
            run('''
                estimate probability density of x = 50 given (xe = 1) within p
                    modeled by g0
            ''')
        run('''
            estimate probability density of x = 50 given (xe = 1) within p
                modeled by g1
        ''')
        with pytest.raises(BQLError):
            run('simulate x from p given xe = 1 limit 1')
        with pytest.raises(BQLError):
            run('''
                simulate x from p modeled by g0 given xe = 1 limit 1
            ''')
        run('''
            simulate x from p modeled by g1 given xe = 1 limit 1
        ''')

        # observed given unrelated latent
        with pytest.raises(BQLError):
            run('''
                estimate probability density of y = 50 given (xe = 1) within p
            ''')
        with pytest.raises(BQLError):
            run('''
                estimate probability density of y = 50 given (xe = 1) within p
                    modeled by g0
            ''')
        run('''
            estimate probability density of y = 50 given (xe = 1) within p
                modeled by g1
        ''')
        with pytest.raises(BQLError):
            run('simulate y from p given xe = 1 limit 1')
        with pytest.raises(BQLError):
            run('''
                simulate y from p modeled by g0 given xe = 1 limit 1
            ''')
        run('''
            simulate y from p modeled by g1 given xe = 1 limit 1
        ''')

        # latent given related observed
        with pytest.raises(BQLError):
            run('''
                estimate probability density of xe = 1 given (x = 50) within p
            ''')
        with pytest.raises(BQLError):
            run('''
                estimate probability density of xe = 1 given (x = 50) within p
                    modeled by g0
            ''')
        run('''
            estimate probability density of xe = 1 given (x = 50) within p
                modeled by g1
        ''')
        with pytest.raises(BQLError):
            run('simulate xe from p given x = 50 limit 1')
        with pytest.raises(BQLError):
            run('''
                simulate xe from p modeled by g0 given x = 50 limit 1
            ''')
        run('''
            simulate xe from p modeled by g1 given x = 50 limit 1
        ''')

        # latent given unrelated observed
        with pytest.raises(BQLError):
            run('''
                estimate probability density of xe = 1 given (y = 50) within p
            ''')
        with pytest.raises(BQLError):
            run('''
                estimate probability density of xe = 1 given (y = 50) within p
                    modeled by g0
            ''')
        run('''
            estimate probability density of xe = 1 given (y = 50) within p
                modeled by g1
        ''')
        with pytest.raises(BQLError):
            run('simulate xe from p given y = 50 limit 1')
        with pytest.raises(BQLError):
            run('''
                simulate xe from p modeled by g0 given y = 50 limit 1
            ''')
        run('''
            simulate xe from p modeled by g1 given y = 50 limit 1
        ''')

        # Tear everything down in dependency order.
        for statement in (
                'drop models from g0',
                'drop generator g0',
                'drop models from g1',
                'drop generator g1',
                'drop population p',
                'drop table t',
        ):
            bdb.execute(statement)
示例#55
0
def test_nig_normal_latent_conditional_smoke():
    """Smoke-test conditional densities between ``x`` and its latent ``xe``.

    The table has a single observed column ``x``.  Generator g0 is a plain
    ``nig_normal`` generator; g1 additionally declares ``xe deviation(x)``.
    Density queries that mention ``xe`` are expected to raise ``BQLError``
    unless they are explicitly ``modeled by g1``; queries that only mention
    ``x`` must succeed with either generator or none.
    """
    with bayesdb_open(':memory:') as bdb:
        def run(bql):
            """Execute *bql* and return every row it yields."""
            return bdb.execute(bql).fetchall()

        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for value in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (value, ))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for statement in (
                'initialize 1 model for g0',
                'analyze g0 for 1 iteration',
                'initialize 1 model for g1',
                'analyze g1 for 1 iteration',
        ):
            bdb.execute(statement)

        # observed given observed
        run('''
            estimate probability density of x = 50 given (x = 50) within p
        ''')
        run('''
            estimate probability density of x = 50 given (x = 50) within p
                modeled by g0
        ''')
        run('''
            estimate probability density of x = 50 given (x = 50) within p
                modeled by g1
        ''')

        # observed given latent
        with pytest.raises(BQLError):
            run('''
                estimate probability density of x = 50 given (xe = 50) within p
            ''')
        with pytest.raises(BQLError):
            run('''
                estimate probability density of x = 50 given (xe = 50) within p
                    modeled by g0
            ''')
        run('''
            estimate probability density of x = 50 given (xe = 50) within p
                modeled by g1
        ''')

        # latent given observed
        with pytest.raises(BQLError):
            run('''
                estimate probability density of xe = 50 given (x = 50) within p
            ''')
        with pytest.raises(BQLError):
            run('''
                estimate probability density of xe = 50 given (x = 50) within p
                    modeled by g0
            ''')
        run('''
            estimate probability density of xe = 50 given (x = 50) within p
                modeled by g1
        ''')

        # Tear everything down in dependency order.
        for statement in (
                'drop models from g0',
                'drop generator g0',
                'drop models from g1',
                'drop generator g1',
                'drop population p',
                'drop table t',
        ):
            bdb.execute(statement)