Exemplo n.º 1
0
def t1_mp():
    """Build the t1 population fixture backed by a multiprocessing engine.

    Wraps the engine returned by ``multiprocessing_crosscat()`` in a
    ``CrosscatMetamodel`` and hands a ``bayesdb(...)`` object using that
    metamodel to ``bayesdb_population`` together with the table name
    ``t1``, population ``p1``, generator ``p1_cc`` and the t1 schema/data
    loaders.  Returns whatever ``bayesdb_population`` returns.
    """
    engine = multiprocessing_crosscat()
    bdb_source = bayesdb(metamodel=CrosscatMetamodel(engine))
    population_columns = [
        'id IGNORE',
        'label CATEGORICAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    return bayesdb_population(
        bdb_source, 't1', 'p1', 'p1_cc', t1_schema, t1_data,
        columns=population_columns)
Exemplo n.º 2
0
def test_hackmetamodel():
    """Check generator creation against registered and unregistered metamodels.

    With no builtin metamodels, creating a generator with either
    ``crosscat`` or ``dotdog`` must raise ``BQLError``.  After ``dotdog``
    is registered (registered, deregistered, then registered again), a
    ``dotdog`` generator can be created once per name, while ``crosscat``
    -- which is never registered in this test -- keeps failing.
    """
    create_t_cc = 'CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)'
    create_t_dd = 'CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)'
    create_u_dd = 'CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)'

    # Use the database as a context manager so it is closed even when an
    # assertion fails part-way through the test.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
        bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
        bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')

        # Nothing is registered yet: both metamodel names are unknown.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_cc)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_dd)

        # Register, deregister and re-register to exercise the round trip.
        dotdog_metamodel = DotdogMetamodel()
        bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
        bayeslite.bayesdb_deregister_metamodel(bdb, dotdog_metamodel)
        bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)

        # crosscat is still unregistered; dotdog now works exactly once.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_cc)
        bdb.execute(create_t_dd)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_dd)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_cc)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_dd)

        # XXX Rest of test originally exercised default metamodel, but
        # syntax doesn't support that now.  Not clear that's wrong either.
        bdb.execute(create_u_dd)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_u_dd)
Exemplo n.º 3
0
def test_guess_population():
    """Check ``bayesdb_guess_population`` on a three-column synthetic table.

    The table holds one row per two-letter lowercase string: ``x`` is the
    string itself, ``y`` the parity of the summed character codes and
    ``z`` the square root of that sum.  Guessing must fail with
    ``ValueError`` when every non-key column is overridden to ``ignore``
    and when the population already exists; otherwise it must record
    ``y`` as nominal and ``z`` as numerical in ``bayesdb_variable``.
    """
    # Use the database as a context manager so it is closed even when an
    # assertion fails.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
        a_z = range(ord('a'), ord('z') + 1)
        aa_zz = ((c, d) for c in a_z for d in a_z)
        data = ((chr(c) + chr(d), (c + d) % 2, math.sqrt(c + d))
                for c, d in aa_zz)
        for row in data:
            bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', row)
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with pytest.raises(ValueError):
            # No modelled columns.  (x is key.)
            bayesdb_guess_population(
                bdb, 'p', 't',
                overrides=[('y', 'ignore'), ('z', 'ignore')])
        bayesdb_guess_population(bdb, 'p', 't')
        with pytest.raises(ValueError):
            # Population already exists.
            bayesdb_guess_population(bdb, 'p', 't')
        variables = bdb.sql_execute(
            'SELECT * FROM bayesdb_variable').fetchall()
        assert variables == [
            (1, None, 1, 'y', 'nominal'),
            (1, None, 2, 'z', 'numerical'),
        ]
Exemplo n.º 4
0
def test_impossible_duplicate_dependency():
    """A schema declaring two columns both dependent and independent fails.

    The generator schema below asks for INDEPENDENT(a,b,c) and
    DEPENDENT(a,c) at the same time; executing it must raise
    ``bayeslite.BQLError``.
    """
    rows = [(0, 1, 0, 0), (1, 0, 0, 1)]

    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        # Register a CrossCat metamodel on a seeded local engine.
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))

        # Load the two-row dataset into a fresh table.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for values in rows:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', values)

        # Contradictory schema: forces both DEP(a c) and IND(a c).
        contradictory_schema = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                INDEPENDENT(a,b,c),
                DEPENDENT(a,c),
            );
        '''

        # The impossible schema must be rejected.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(contradictory_schema)
Exemplo n.º 5
0
def test_crosscat_constraints():
    """Check the constraint lists (``Y``) handed to the CrossCat engine.

    A ``LocalEngine`` subclass records the ``Y`` argument of its last
    probability, sampling or imputation call; after each BQL query the
    recorded value is compared with the expected list of constraint
    tuples.
    """
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        # Every override stores Y on the instance before delegating to
        # the parent engine with keyword arguments.

        def predictive_probability_multistate(
                self, M_c, X_L_list, X_D_list, Y, Q):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            delegate = parent.simple_predictive_probability_multistate
            return delegate(
                M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)

        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            return parent.simple_predictive_sample(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            return parent.impute_and_confidence(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

    engine = FakeEngine(seed=0)
    with bayesdb(metamodel=CrosscatMetamodel(engine)) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        create_generator = '''
            CREATE GENERATOR t1_cc FOR t1 USING crosscat(
                label CATEGORICAL,
                age NUMERICAL,
                weight NUMERICAL
            )
        '''
        bdb.execute(create_generator)
        gid = core.bayesdb_get_generator(bdb, 't1_cc')

        # Generator column numbers start at 1 in declaration order ...
        for colno, name in enumerate(('label', 'age', 'weight'), 1):
            assert core.bayesdb_generator_column_number(
                bdb, gid, name) == colno

        # ... and map onto CrossCat column numbers starting at 0.
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        for colno in (1, 2, 3):
            assert crosscat_cc_colno(bdb, gid, colno) == colno - 1

        bdb.execute('INITIALIZE 1 MODEL FOR t1_cc')
        bdb.execute('ANALYZE t1_cc FOR 1 ITERATION WAIT')

        estimate = bdb.execute(
            'ESTIMATE PROBABILITY OF age = 8 GIVEN (weight = 16) BY t1_cc')
        estimate.next()
        assert engine._last_Y == [(28, 2, 16)]

        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM t1_cc WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]

        simulate = bdb.execute(
            'SIMULATE weight FROM t1_cc GIVEN age = 8 LIMIT 1')
        simulate.next()
        assert engine._last_Y == [(28, 1, 8)]
Exemplo n.º 6
0
def test_crosscat_constraints():
    """Check the constraint lists (``Y``) passed to CrossCat via a population.

    A ``LocalEngine`` subclass records the ``Y`` argument of its last
    probability, sampling or imputation call.  Queries run against
    population ``p1`` (generator ``p1_cc``) and the recorded value is
    compared with the expected constraint tuples.  Simulating given a
    nominal value that is not in the data must raise ``BQLError``.
    """
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        # Every override stores Y on the instance, then forwards all
        # arguments to the parent engine by keyword.

        def predictive_probability_multistate(
                self, M_c, X_L_list, X_D_list, Y, Q):
            self._last_Y = Y
            forwarded = dict(
                M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)
            parent = super(FakeEngine, self)
            return parent.simple_predictive_probability_multistate(
                **forwarded)

        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            forwarded = dict(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)
            return super(FakeEngine, self).simple_predictive_sample(
                **forwarded)

        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            forwarded = dict(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)
            return super(FakeEngine, self).impute_and_confidence(
                **forwarded)

    engine = FakeEngine(seed=0)
    with bayesdb(metamodel=CrosscatMetamodel(engine)) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        create_population = '''
            CREATE POPULATION p1 FOR t1 (
                id IGNORE;
                label CATEGORICAL;
                age NUMERICAL;
                weight NUMERICAL
            )
        '''
        create_generator = '''
            CREATE GENERATOR p1_cc FOR p1 USING crosscat(
                label CATEGORICAL,
                age NUMERICAL,
                weight NUMERICAL
            )
        '''
        bdb.execute(create_population)
        bdb.execute(create_generator)

        # Population variable numbers start at 1 in declaration order ...
        pid = core.bayesdb_get_population(bdb, 'p1')
        for varno, name in enumerate(('label', 'age', 'weight'), 1):
            assert core.bayesdb_variable_number(bdb, pid, None, name) == varno

        # ... and map onto CrossCat column numbers starting at 0.
        gid = core.bayesdb_get_generator(bdb, pid, 'p1_cc')
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        for varno in (1, 2, 3):
            assert crosscat_cc_colno(bdb, gid, varno) == varno - 1

        bdb.execute('INITIALIZE 1 MODEL FOR p1_cc')
        bdb.execute('ANALYZE p1_cc FOR 1 ITERATION WAIT')

        density = bdb.execute(
            'ESTIMATE PROBABILITY DENSITY OF age = 8 GIVEN (weight = 16)'
            ' BY p1')
        density.next()
        assert engine._last_Y == [(28, 2, 16)]

        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM p1 WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]

        simulated = bdb.execute(
            'SIMULATE weight FROM p1 GIVEN age = 8 LIMIT 1')
        simulated.next()
        assert engine._last_Y == [(28, 1, 8)]

        # Simulate with an unknown nominal value should throw an error.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute("SIMULATE weight FROM p1 GIVEN label = 'q' LIMIT 1;")
Exemplo n.º 7
0
def t1_mp():
    """Build the t1 generator fixture backed by a multiprocessing engine.

    Wraps the engine returned by ``multiprocessing_crosscat()`` in a
    ``CrosscatMetamodel`` and hands a ``bayesdb(...)`` object using that
    metamodel to ``bayesdb_generator`` together with the table name
    ``t1``, generator name ``t1_cc`` and the t1 schema/data loaders.
    Returns whatever ``bayesdb_generator`` returns.
    """
    engine = multiprocessing_crosscat()
    bdb_source = bayesdb(metamodel=CrosscatMetamodel(engine))
    generator_columns = [
        'label CATEGORICAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    return bayesdb_generator(
        bdb_source, 't1', 't1_cc', t1_schema, t1_data,
        columns=generator_columns)
Exemplo n.º 8
0
def bayesdb(metamodel=None, **kwargs):
    """Yield an open BayesDB with one metamodel registered, then close it.

    :param metamodel: metamodel to register; when ``None`` a
        ``CrosscatMetamodel`` around ``local_crosscat()`` is used.
    :param kwargs: extra keyword arguments forwarded to
        ``bayeslite.bayesdb_open`` (builtin metamodels are always off).

    This is a generator function that yields exactly once.
    NOTE(review): callers appear to use it in ``with`` statements, so it
    is presumably wrapped by ``contextlib.contextmanager`` where it is
    defined -- confirm the decorator is present.
    """
    if metamodel is None:
        metamodel = CrosscatMetamodel(local_crosscat())
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False, **kwargs)
    try:
        # Register inside the try block so the database is still closed
        # if registration itself raises.
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        yield bdb
    finally:
        bdb.close()
Exemplo n.º 9
0
def test_subsample():
    """Compare a SUBSAMPLE(OFF) generator with a SUBSAMPLE(100) generator.

    Two populations are guessed from the ``dha`` table.  Several
    estimation/inference queries are run against the subsampled one to
    check that they execute.  Afterwards the first 100 subsample row ids
    (ordered by ``cc_row_id``) of the full generator must be exactly
    1..100, whereas those of the subsampled generator must differ.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb,
                                 'hospitals_full',
                                 'dha',
                                 overrides=[('name', 'key')])
        bayesdb_guess_population(bdb,
                                 'hospitals_sub',
                                 'dha',
                                 overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING crosscat (
                SUBSAMPLE(OFF)
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING crosscat (
                SUBSAMPLE(100)
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION WAIT')
        # Smoke-test queries touching row ids 1, 2, 101 and 102; only
        # successful execution is checked, not the results.
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
                    ' FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE hospitals_sub'
                    ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
                    ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        # Materialize the range as a list: comparing a list with a bare
        # range object is never equal on Python 3, which would make the
        # first assertion fail and the second one vacuous.
        first_hundred = list(range(1, 100 + 1))
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full, ))
        assert [row[0] for row in cursor] == first_hundred
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid, ))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
Exemplo n.º 10
0
def test_codebook_value_map():
    '''
    Simulate with a codebook value that was absent when models were built.

    A categorical column in crosscat can only take on a fixed number of
    values v1, v2, ..., vN.  Here the categorical column `city` takes on
    the values `RIO, LA, SF, DC` as specified in the codebook value map.

    Steps:
        INITIALIZE dummy table with only RIO and SF appearing in dataset
        ANALYZE dummy_cc
        INSERT rows with `city` names `LA` and `DC`
        ANALYZE dummy_cc
        SIMULATE specifying `city` = `LA` (throws KeyError)
    '''
    analyze = 'ANALYZE dummy_cc FOR 20 ITERATIONS WAIT'

    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))

        bayeslite.bayesdb_read_csv(
            bdb, 'dummy', dummy_data, header=True, create=True)

        # Spill the codebook text to a temporary file so it can be loaded
        # by path.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as codebook_file:
            codebook_path = codebook_file.name
            with open(codebook_path, 'w') as out:
                out.write(dummy_codebook)
            bayeslite.bayesdb_load_codebook_csv_file(
                bdb, 'dummy', codebook_path)

        create_generator = '''
            CREATE GENERATOR dummy_cc FOR dummy
                USING crosscat(
                    GUESS(*),
                    kerberos IGNORE,
                    age NUMERICAL,
                    city CATEGORICAL
                )
        '''
        bdb.execute(create_generator)
        bdb.execute('INITIALIZE 10 MODELS FOR dummy_cc')
        bdb.execute(analyze)
        bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = RIO LIMIT 5')

        # Add rows whose city values were not present at initialization.
        insert_new_cities = '''
            INSERT INTO dummy (kerberos, age, city) VALUES
                ('jackie', 18, 'LA'), ('rocker', 22, 'DC')
        '''
        bdb.sql_execute(insert_new_cities)
        bdb.execute(analyze)
        # The cursor is kept bound (but not consumed), as in the original
        # sequence of statements.
        all_rows_cursor = bdb.sql_execute('SELECT * FROM dummy')
        with pytest.raises(KeyError):
            bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = LA LIMIT 5')
Exemplo n.º 11
0
def test_impossible_nontransitive_dependency():
    """Non-transitive dependency constraints fail at model initialization.

    Dependence is not transitive in general, but crosscat assumes
    transitive closure under dependency constraints (transitivity under
    independence is not forced).  The schema DEP(a b), DEP(b c), IND(a c)
    is therefore unsatisfiable for a crosscat local engine: creating the
    generator succeeds, but initializing models raises.  If CrossCat
    changes how it deals with impossible constraints (such as random
    dropout), this test needs updating.
    """
    rows = [(0, 1, 0, 0), (1, 0, 0, 1)]

    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        # Register a CrossCat metamodel on a seeded local engine.
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))

        # Load the two-row dataset into a fresh table.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for values in rows:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', values)

        # Schema forcing DEP(a b), DEP(b c) and IND(a c), which is
        # non-transitive.
        nontransitive_schema = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                DEPENDENT(a,b),
                DEPENDENT(b,c),
                INDEPENDENT(a,c)
            );
        '''

        # Creating the generator itself is expected to work.
        bdb.execute(nontransitive_schema)

        # Initialization fails because no initial state exists.
        # XXX Currently CrossCat throws a RuntimeError, we should fix
        # the CrossCat exception hierarchy.
        with pytest.raises(RuntimeError):
            bdb.execute('INITIALIZE 10 MODELS FOR bar')
Exemplo n.º 12
0
def run(stdin, stdout, stderr, argv):
    """Run the bayeslite shell with a CrossCat metamodel registered.

    :param stdin: input stream handed to the shell.
    :param stdout: output stream handed to the shell.
    :param stderr: error stream; usage errors are written here.
    :param argv: full argument vector; ``argv[0]`` is the program name
        and the remainder is parsed by ``parse_args``.
    :return: ``1`` on a usage error (no database path and no in-memory
        flag, or a path of ``'-'``), otherwise ``0``.
    """
    args = parse_args(argv[1:])
    # Keep only the last path component of the program name.  rfind
    # returns -1 when there is no slash, so ``+ 1`` yields index 0 and
    # the whole name is kept; a slash at index 0 is stripped as well.
    progname = argv[0]
    progname = progname[progname.rfind('/') + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname, ))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname, ))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
                                 builtin_metamodels=False)

    # Any job count other than 1 selects the multiprocessing engine; a
    # non-positive count is passed on as cpu_count=None.
    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        # Read the per-user init file first, unless disabled.
        if not args.no_init_file:
            init_file = os.path.expanduser('~/.bayesliterc')
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        # Then read each file given on the command line, stopping at the
        # first one that is not a regular file.
        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file.  Aborting.\n' %
                                          (str(path), ))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
Exemplo n.º 13
0
def test_simulate_drawconstraint():
    """Simulating a column that is also constrained returns the constraint.

    ``ttl_mdcr_spnd`` is both simulated and fixed to 40000 by the GIVEN
    clause, so every one of the 100 simulated rows must carry 40000 in
    its first position.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        create_generator = '''
            CREATE GENERATOR dha_cc FOR dha USING crosscat (
                GUESS(*),
                name KEY
            )
        '''
        bdb.execute(create_generator)
        bdb.execute('INITIALIZE 1 MODEL FOR dha_cc')
        bdb.execute('ANALYZE dha_cc FOR 1 ITERATION WAIT')
        simulate = '''
            SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc
                GIVEN TTL_MDCR_SPND = 40000
                LIMIT 100
        '''
        samples = bdb.execute(simulate).fetchall()
        simulated_spend = [sample[0] for sample in samples]
        expected_spend = [40000] * 100
        assert simulated_spend == expected_spend
Exemplo n.º 14
0
def test_legacy_models__ci_slow():
    """Load legacy crosscat models for ``dha`` and check two ranked queries.

    Loading legacy models before the ``dha`` table exists must raise
    ``ValueError``; after the CSV is read it must succeed.  The test then
    checks that a guessed generator can be created, that the codebook
    can be loaded twice, and that two ``ESTIMATE ... ORDER BY`` queries
    return the expected top-ten names.
    """
    similarity_bql = '''
        ESTIMATE name FROM dha_cc
            ORDER BY SIMILARITY TO (name = ?) DESC
            LIMIT 10
    '''
    # Tickles an issue in case-folding of column names.
    predictive_probability_bql = '''
        ESTIMATE name
            FROM dha_cc
            ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
            LIMIT 10
    '''

    # Use the database as a context manager so it is closed even when an
    # assertion fails.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # The table does not exist yet, so loading must fail.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(
                bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(
            bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute(
            'create generator dha_cc0 for dha using crosscat(guess(*))')
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        with bdb.savepoint():
            most_similar = bdb.execute(
                similarity_bql, ('Albany NY', )).fetchall()
            assert most_similar == [
                ('Albany NY', ),
                ('Scranton PA', ),
                ('United States US', ),
                ('Norfolk VA', ),
                ('Reading PA', ),
                ('Salisbury MD', ),
                ('Louisville KY', ),
                ('Cleveland OH', ),
                ('Covington KY', ),
                ('Akron OH', ),
            ]
        with bdb.savepoint():
            least_probable = bdb.execute(
                predictive_probability_bql).fetchall()
            assert least_probable == [
                ('McAllen TX', ),
                ('Worcester MA', ),
                ('Beaumont TX', ),
                ('Temple TX', ),
                ('Corpus Christi TX', ),
                ('Takoma Park MD', ),
                ('Kingsport TN', ),
                ('Bangor ME', ),
                ('Lebanon NH', ),
                ('Panama City FL', ),
            ]
Exemplo n.º 15
0
import pytest
import tempfile

import crosscat.LocalEngine

import bayeslite

import bayeslite.core as core

from bayeslite import bql_quote_name
from bayeslite.metamodels.crosscat import CrosscatMetamodel
from bayeslite.metamodels.iid_gaussian import StdNormalMetamodel

examples = {
    'crosscat': (
        lambda: CrosscatMetamodel(crosscat.LocalEngine.LocalEngine(seed=0)),
        't',
        'CREATE TABLE t(x NUMERIC, y CYCLIC, z CATEGORICAL)',
        'INSERT INTO t (x, y, z) VALUES (?, ?, ?)',
        [
            (0, 1.57, 'foo'),
            (1.83, 3.141, 'bar'),
            (1.82, 3.140, 'bar'),
            (-1, 6.28, 'foo'),
        ],
        'p',
        'p_cc',
        'CREATE POPULATION p FOR t'
        '(x NUMERICAL; y CYCLIC; z CATEGORICAL)',
        'CREATE GENERATOR p_cc FOR p USING crosscat()',
        'CREATE GENERATOR p_cc FOR p USING crosscat(DEPENDENT)',
Exemplo n.º 16
0
def test_complex_dependencies__ci_slow():
    """Check that forced (in)dependence constraints hold before and after
    analysis.

    A synthetic table with three real-valued columns (x, y, z) and two
    categorical columns (v, w) is modelled with a schema forcing
    IND(x y), IND(x v) and DEP(z v w).  The pairwise dependence
    probability of each constrained pair must be exactly 0 or 1 both
    right after initialization and after ten analysis iterations.
    """
    # Parameterize number of rows in synthetic dataset.
    n_rows = 250

    # Add an id column to ensure generator and cc colnos are different.
    ids = np.arange(n_rows)

    # Create real-valued data, such that DEP(x,y), DEP(y,z), and IND(x,z)
    # NOTE(review): np.random is not seeded here, so the dataset differs
    # between runs; the assertions only rely on the forced constraints.
    mean = [4, -2, -11]
    cov = [[3.0, 0.7, 0.0], [0.7, 4.0, 0.6], [0.0, 0.6, 2.0]]
    numerical_data = np.random.multivariate_normal(mean, cov, size=n_rows)
    x, y, z = numerical_data[:, 0], numerical_data[:, 1], numerical_data[:, 2]

    # Create categorical data v, highly dependent on x.
    bins = [np.percentile(x, p) for p in range(0, 101, 10)]
    v = np.digitize(x, bins)

    # Create categorical data, independent of all other columns.
    w = np.random.choice(range(8), size=n_rows)

    # Stack the columns in the same order as the table definition below
    # (id, x, y, z, v, w) so that each array lands in the column that
    # carries its name.
    data = np.vstack((ids, x, y, z, v, w)).T

    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,x,y,z,v,w)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?,?,?)', row)

        # Create schema, we will force  IND(x y), IND(x v), and DEP(z v w).
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                x NUMERICAL,
                y NUMERICAL,
                z NUMERICAL,
                v CATEGORICAL,
                w CATEGORICAL,
                INDEPENDENT(x, y),
                INDEPENDENT(x, v),
                DEPENDENT(z, v, w)
            );
        '''
        bdb.execute(bql)

        # Prepare the checker function.
        def check_dependencies():
            # Pairs not listed below are unconstrained and are skipped.
            bql = '''
                ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF bar
            '''
            for _id, col1, col2, dep in bdb.execute(bql):
                # test IND(x y)
                if (col1, col2) in [('x', 'y'), ('y', 'x')]:
                    assert dep == 0
                    continue
                # test IND(x v)
                if (col1, col2) in [('x', 'v'), ('v', 'x')]:
                    assert dep == 0
                    continue
                # test DEP(z v)
                if (col1, col2) in [('z', 'v'), ('v', 'z')]:
                    assert dep == 1
                    continue
                # test DEP(z w)
                if (col1, col2) in [('z', 'w'), ('w', 'z')]:
                    assert dep == 1
                    continue

        # Test dependency pre-analysis.
        bdb.execute('INITIALIZE 10 MODELS FOR bar')
        check_dependencies()

        # Test dependency post-analysis.
        bdb.execute('ANALYZE bar for 10 ITERATION WAIT')
        check_dependencies()
Exemplo n.º 17
0
def new_cc_metamodel(prng):
    """Return a ``CrosscatMetamodel`` on a local engine seeded from *prng*.

    The engine seed is drawn with ``prng.randint(0, 2**31)``.
    """
    engine_seed = prng.randint(0, 2**31)
    local_engine = CrosscatLocalEngine(seed=engine_seed)
    return CrosscatMetamodel(local_engine)
Exemplo n.º 18
0
    'BayesDB',
    'BayesDBException',
    'BayesDBTxnError',
    'bayesdb_deregister_metamodel',
    'bayesdb_load_codebook_csv_file',
    'bayesdb_load_legacy_models',
    'bayesdb_open',
    'bayesdb_read_csv',
    'bayesdb_read_csv_file',
    'bayesdb_register_metamodel',
    'bql_quote_name',
    'IBayesDBMetamodel',
    'IBayesDBTracer',
    '__version__',
]

# Build a CrossCat metamodel around a local engine with a fixed seed of 0
# and pass it to bayesdb_builtin_metamodel at import time.
# NOTE(review): presumably this makes crosscat available by default in
# newly opened databases -- confirm against bayesdb_builtin_metamodel.
from bayeslite.metamodels.crosscat import CrosscatMetamodel
from crosscat.LocalEngine import LocalEngine as CrosscatLocalEngine

bayesdb_builtin_metamodel(CrosscatMetamodel(CrosscatLocalEngine(seed=0)))

# Import-time version check: runs bayeslite.remote.version_check() unless
# the BAYESDB_DISABLE_VERSION_CHECK environment variable is present (its
# value is not inspected, only its presence).
import bayeslite.remote
import os
if not 'BAYESDB_DISABLE_VERSION_CHECK' in os.environ:
    bayeslite.remote.version_check()

# Notebooks should contain comment lines documenting this behavior and
# offering a solution, like so:
# Please keep BayesDB up to date. To disable remote version checking:
# import os; os.environ['BAYESDB_DISABLE_VERSION_CHECK'] = '1'
Exemplo n.º 19
0
def test_correlation():
    """Check ``ESTIMATE CORRELATION, CORRELATION PVALUE`` over column pairs.

    For the empty table ``u`` every pair must yield ``None`` for both the
    correlation and its p-value.  For the populated table ``t`` the
    results are compared row by row with a table of expected values:
    names and generator ids exactly, correlations within a relative
    error of 1e-10 and p-values within a relative error of 1e-1.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)
        bdb.sql_execute('CREATE TABLE u(id, c0, c1, n0, n1, r0, r1)')
        bdb.execute('''
            CREATE GENERATOR u_cc FOR u USING crosscat (
                c0 CATEGORICAL,
                c1 CATEGORICAL,
                n0 NUMERICAL,
                n1 NUMERICAL,
                r0 CYCLIC,
                r1 CYCLIC,
            )
        ''')
        # No data at all: every pair has undefined correlation.
        assert bdb.execute('ESTIMATE CORRELATION, CORRELATION PVALUE'
                ' FROM PAIRWISE COLUMNS OF u_cc'
                ' WHERE name0 < name1'
                ' ORDER BY name0, name1').fetchall() == \
            [
                (1, 'c0', 'c1', None, None),
                (1, 'c0', 'n0', None, None),
                (1, 'c0', 'n1', None, None),
                (1, 'c0', 'r0', None, None),
                (1, 'c0', 'r1', None, None),
                (1, 'c1', 'n0', None, None),
                (1, 'c1', 'n1', None, None),
                (1, 'c1', 'r0', None, None),
                (1, 'c1', 'r1', None, None),
                (1, 'n0', 'n1', None, None),
                (1, 'n0', 'r0', None, None),
                (1, 'n0', 'r1', None, None),
                (1, 'n1', 'r0', None, None),
                (1, 'n1', 'r1', None, None),
                (1, 'r0', 'r1', None, None),
            ]
        bdb.sql_execute('CREATE TABLE t'
                        '(id, c0, c1, cx, cy, n0, n1, nc, nl, nx, ny)')
        data = [
            ('foo', 'quagga', 'x', 'y', 0, -1, +1, 1, 0, 13),
            ('bar', 'eland', 'x', 'y', 87, -2, -1, 2, 0, 13),
            ('baz', 'caribou', 'x', 'y', 92.1, -3, +1, 3, 0, 13),
        ] * 10
        # Prefix each row with a 1-based id.
        for i, row in enumerate(data):
            row = (i + 1, ) + row
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?,?,?,?,?,?,?,?,?)',
                            row)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                c0 CATEGORICAL,
                c1 CATEGORICAL,
                cx CATEGORICAL,
                cy CATEGORICAL,
                n0 NUMERICAL,
                n1 NUMERICAL,
                nc NUMERICAL,
                nl NUMERICAL,
                nx NUMERICAL,
                ny NUMERICAL
            )
        ''')
        result = bdb.execute('ESTIMATE CORRELATION, CORRELATION PVALUE'
                             ' FROM PAIRWISE COLUMNS OF t_cc'
                             ' WHERE name0 < name1'
                             ' ORDER BY name0, name1').fetchall()
        expected = [
            (2, 'c0', 'c1', 1., 2.900863120340436e-12),
            (2, 'c0', 'cx', None, None),
            (2, 'c0', 'cy', None, None),
            (2, 'c0', 'n0', 1., 0.),
            (2, 'c0', 'n1', 1., 0.),
            (2, 'c0', 'nc', 1., 0.),
            (2, 'c0', 'nl', 1., 0.),
            (2, 'c0', 'nx', None, None),
            (2, 'c0', 'ny', None, None),
            (2, 'c1', 'cx', None, None),
            (2, 'c1', 'cy', None, None),
            (2, 'c1', 'n0', 1., 0.),
            (2, 'c1', 'n1', 1., 0.),
            (2, 'c1', 'nc', 1., 0.),
            (2, 'c1', 'nl', 1., 0.),
            (2, 'c1', 'nx', None, None),
            (2, 'c1', 'ny', None, None),
            (2, 'cx', 'cy', None, None),
            (2, 'cx', 'n0', None, None),
            (2, 'cx', 'n1', None, None),
            (2, 'cx', 'nc', None, None),
            (2, 'cx', 'nl', None, None),
            (2, 'cx', 'nx', None, None),
            (2, 'cx', 'ny', None, None),
            (2, 'cy', 'n0', None, None),
            (2, 'cy', 'n1', None, None),
            (2, 'cy', 'nc', None, None),
            (2, 'cy', 'nl', None, None),
            (2, 'cy', 'nx', None, None),
            (2, 'cy', 'ny', None, None),
            (2, 'n0', 'n1', 0.7913965673596881, 0.),
            (2, 'n0', 'nc', 0.20860343264031175, 0.0111758925135),
            (2, 'n0', 'nl', 0.7913965673596881, 0.),
            (2, 'n0', 'nx', None, None),
            (2, 'n0', 'ny', None, None),
            (2, 'n1', 'nc', 0., 1.),
            (2, 'n1', 'nl', 1., 0.),
            (2, 'n1', 'nx', None, None),
            (2, 'n1', 'ny', None, None),
            (2, 'nc', 'nl', 0., 1.),
            (2, 'nc', 'nx', None, None),
            (2, 'nc', 'ny', None, None),
            (2, 'nl', 'nx', None, None),
            (2, 'nl', 'ny', None, None),
            (2, 'nx', 'ny', None, None),
        ]
    # zip() stops at the shorter sequence, so without this check a short
    # or empty result would make the loop below pass vacuously.
    assert len(result) == len(expected)
    for expected_item, observed_item in zip(expected, result):
        (xpd_genid, xpd_name0, xpd_name1, xpd_corr, xpd_corr_p) = expected_item
        (obs_genid, obs_name0, obs_name1, obs_corr, obs_corr_p) = observed_item
        assert xpd_genid == obs_genid
        assert xpd_name0 == obs_name0
        assert xpd_name1 == obs_name1
        # Exact equality covers the None/None case before relerr is tried.
        assert xpd_corr == obs_corr or relerr(xpd_corr, obs_corr) < 1e-10
        assert (xpd_corr_p == obs_corr_p
                or relerr(xpd_corr_p, obs_corr_p) < 1e-1)