Пример #1
0
def cgpm_smoke_bdb():
    """Yield an in-memory bdb holding a small table `t` and population `p`.

    A CGPM metamodel whose registry contains only `piecewise` is registered
    on the bdb.  Table `t` receives 27 rows, one for every (i, j, k) with
    each index in 0..2; a cell is stored as NULL when the index driving it
    is odd.  Population `p` models output and input as numerical and cat as
    categorical.
    """
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        metamodel = CGPM_Metamodel({'piecewise': PieceWise}, multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)

        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        insert_sql = '''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    '''
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    # output is driven by i, cat by j, input by k: an odd
                    # driving index blanks the corresponding cell.
                    out_value = None if i % 2 else i + j / (k + 1)
                    if j % 2:
                        cat_value = None
                    else:
                        cat_value = -1 if (i + j * k) % 2 else +1
                    in_value = None if k % 2 else (i * j - k)**2
                    bdb.sql_execute(
                        insert_sql, (out_value, cat_value, in_value))

        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                MODEL output, input AS NUMERICAL;
                MODEL cat AS CATEGORICAL
            )
        ''')

        yield bdb
Пример #2
0
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites bdb with a population and cgpm registered.

    The `satellites` population is created over `satellites_ucs` first, and
    a CGPM metamodel with an empty registry is registered afterwards.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            create population satellites for satellites_ucs with schema(
                model apogee as numerical;
                model class_of_orbit as categorical;
                model country_of_operator as categorical;
                model launch_mass as numerical;
                model perigee as numerical;
                model period as numerical
            )
        ''')
        bayesdb_register_metamodel(bdb, CGPM_Metamodel({}, multiprocess=0))
        yield bdb
Пример #3
0
def test_bad_analyze_vars():
    """ANALYZE must reject unknown variables in VARIABLES and SKIP clauses."""
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        # pytest.skip raises, so nothing below runs without the import.
        pytest.skip('no sklearn')
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        cgpm_registry = dict(kepler=Kepler, linreg=LinearRegression)
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(cgpm_registry))
        bdb.execute('''
            CREATE METAMODEL satellites_cgpm FOR satellites USING cgpm
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_cgpm')

        # Both an empty option list and no option list are accepted.
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION WAIT ()')
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION WAIT')

        # `perige' is a misspelling of perigee: rejected under VARIABLES.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION WAIT (
                    VARIABLES period, perige
                )
            ''')
        # The same misspelling is rejected under SKIP.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION WAIT (
                    SKIP period, perige
                )
            ''')
Пример #4
0
def test_regress_bonanza__ci_integration():
    """Exercise REGRESS with numerical, nominal, mixed, and selected givens.

    Every successful regression is validated with
    `check_regression_variables`, which inspects the variable names of the
    returned (variable, coefficient) rows.  Two failure modes are covered
    as well: mixing `*` with named variables (BQLError) and regressing on
    a single nominal with one sample (ValueError).
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS NOMINAL;
                MODEL country_of_operator AS NOMINAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL m FOR satellites WITH BASELINE crosscat;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            """Assert each result row names a distinct, expected variable.

            A variable is expected when it is listed in `numericals` or is
            a dummy-coded level `<nominal>_dum_...` of one of `nominals`.
            """
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.  The result must
        # be bound to `results`; otherwise the check below would silently
        # re-validate the rows of the previous query.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        # NOTE(review): a binding is supplied although this query has no
        # `?` placeholder -- presumably bayeslite ignores it; confirm.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''', (3,)).fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELLED BY m USING MODEL 1;
        ''', (3,)).fetchall()

        # Re-run the selector on its own to learn which variables the
        # regression above was given, then split them by statistical type.
        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
                ORDER BY dependence probability with apogee DESC
                LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
Пример #5
0
def test_analysis_subproblems_basic():
    """Crash-test ANALYZE with SUBPROBLEM(S), VARIABLES, ROWS and OPTIMIZED.

    Runs against an analysis schema `g0` created with SUBSAMPLE 10 and
    four initialized analyses.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE ANALYSIS SCHEMA g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 ANALYSES FOR g0')

        # Each run is tried with the default and with the OPTIMIZED backend.
        backend_options = ['', 'OPTIMIZED;']
        clustering_subproblems = [
            'variable clustering',
            'variable clustering concentration',
            'row clustering',
            'row clustering concentration',
        ]

        # Every subproblem on its own; variable hyperparameters come later.
        for optimized in backend_options:
            for subproblem in clustering_subproblems:
                bdb.execute('''
                    ANALYZE g0 ANALYSES 0,1 FOR 4 ITERATION WAIT(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (subproblem, optimized))

        # Variable hyperparameters work with the default backend ...
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION WAIT (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        # ... but the OPTIMIZED backend does not support them.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS WAIT (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # ROWS: gather the table rowids recorded for this generator, and
        # the rowids below 20 that are not among them.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        individuals = bdb.execute(
            '''
            SELECT table_rowid FROM  bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, ))
        subsample_rows = [record[0] for record in individuals]
        bad_rows = [
            rowid for rowid in xrange(20) if rowid not in subsample_rows
        ]
        subsample_list = ','.join(str(rowid) for rowid in subsample_rows)
        bad_list = ','.join(str(rowid) for rowid in bad_rows)
        for optimized in backend_options:
            bdb.execute('''
                ANALYZE g0 ANALYSIS 3 FOR 1 ITERATION WAIT (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
            )
            ''' % (subsample_list, optimized))
            # Rows outside the population or subsample must be rejected.
            with pytest.raises(BQLError):
                bdb.execute('''
                    ANALYZE g0 ANALYSIS 3 FOR 1 ITERATION WAIT (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                )
                ''' % (bad_list, optimized))
Пример #6
0
    'bayesdb_open',
    'bayesdb_read_csv',
    'bayesdb_read_csv_file',
    'bayesdb_register_metamodel',
    'bayesdb_upgrade_schema',
    'bql_quote_name',
    'IBayesDBMetamodel',
    'IBayesDBTracer',
]

# Builtin metamodel #1: crosscat, backed by a local engine seeded with 0.
from bayeslite.metamodels.crosscat import CrosscatMetamodel
from crosscat.LocalEngine import LocalEngine as CrosscatLocalEngine

bayesdb_builtin_metamodel(CrosscatMetamodel(CrosscatLocalEngine(seed=0)))

# Builtin metamodel #2: cgpm, with an empty registry.
from bayeslite.metamodels.cgpm_metamodel import CGPM_Metamodel

bayesdb_builtin_metamodel(CGPM_Metamodel(dict(), multiprocess=True))

# Remote version check, performed at import time unless the environment
# variable BAYESDB_DISABLE_VERSION_CHECK is set (to any value).
import os
import bayeslite.remote
if 'BAYESDB_DISABLE_VERSION_CHECK' not in os.environ:
    bayeslite.remote.version_check()

# Notebooks should carry comment lines that document this behavior and
# show how to opt out, like so:
# Please keep BayesDB up to date. To disable remote version checking:
# import os; os.environ['BAYESDB_DISABLE_VERSION_CHECK'] = '1'
Пример #7
0
def test_predictive_relevance():
    """Exercise ESTIMATE PREDICTIVE RELEVANCE in FROM and BY forms.

    Covers self-relevance, queries mixing EXISTING and HYPOTHETICAL rows,
    bindings, the error cases (no matching target or existing row, unknown
    context variable, unknown categorical value), and the behavior for a
    row inserted after analysis.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(cgpm_registry=dict()))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA (
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m FOR satellites;')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        bdb.execute('ANALYZE m FOR 25 ITERATION WAIT (OPTIMIZED);')

        # Check self-similarities, and also provide coverage of bindings.
        # Each of the first four rows is compared against itself; the
        # rowid of the loop is what gets bound, so four distinct rows are
        # checked rather than row 1 four times.
        rowids = bdb.execute('SELECT OID from satellites_ucs;').fetchall()
        for (rowid,) in rowids[:4]:
            cursor = bdb.execute(
                '''
                ESTIMATE PREDICTIVE RELEVANCE
                    TO EXISTING ROWS (rowid = ?)
                    IN THE CONTEXT OF "period"
                FROM satellites
                WHERE rowid = ?
            ''', (rowid, rowid))
            assert next(cursor)[0] == 1.

        # A full extravaganza query, using FROM (as a 1-row).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (perigee=1.0, launch_mass=120),
                    (country_of_operator='Bulgaria', perigee=2.0))
                IN THE CONTEXT OF "country_of_operator"
            FROM satellites
            LIMIT 5
        ''').fetchall()
        assert len(cursor) == 5
        assert all(0 <= c[0] <= 1 for c in cursor)

        # A full extravaganza query, using BY (as a constant).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "country_of_operator"
            BY satellites
        ''').fetchall()
        assert len(cursor) == 1
        assert all(0 <= c[0] <= 1 for c in cursor)

        # Hypothetical satellite with negative perigee should not be similar,
        # and use a binding to just ensure that they work.
        cursor = bdb.execute(
            '''
            ESTIMATE PREDICTIVE RELEVANCE
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (perigee = ?))
                IN THE CONTEXT OF "perigee"
            FROM satellites
            LIMIT 5
        ''', (-10000, )).fetchall()
        assert len(cursor) == 5
        assert all(np.allclose(c[0], 0) for c in cursor)

        # No matching target OF row.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid < 0) TO EXISTING ROWS (rowid = 10)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown CONTEXT variable "banana".
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 1) TO EXISTING ROWS (rowid = 2)
                    IN THE CONTEXT OF "banana"
                BY satellites
            ''')

        # No matching EXISTING ROW.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10) TO EXISTING ROWS (rowid < 0)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown categorical values 'Mongolia' in HYPOTHETICAL ROWS.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10)
                    TO HYPOTHETICAL ROWS WITH VALUES (
                        (country_of_operator='Mongolia'),
                        (country_of_operator='Bulgaria', perigee=2.0))
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Create a new row.  It is inserted after ANALYZE ran, so the
        # models have not incorporated it.
        bdb.sql_execute('''
            INSERT INTO satellites_ucs
            (apogee, launch_mass) VALUES (12.128, 12.128)
        ''')

        # TARGET ROW not yet incorporated: no relevance value is produced
        # (the cursor yields None).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (apogee = 12.128)
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # EXISTING ROW not yet incorporated: again no relevance value (the
        # cursor yields None), since there is no hypothetical row either.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other EXISTING ROWS with apogee > 0, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other HYPOTHETICAL ROWS, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None
Пример #8
0
def test_initialize_with_all_nulls():
    """Initializing models over an all-NULL manifest column must fail.

    Column `a` of table `t` is NULL in every row.  With the crosscat
    baseline, INITIALIZE raises BQLError whether `a` is modeled as
    numerical or as nominal.  It succeeds when `a` is ignored, and also
    when `a` is overridden by a dummy CGPM.
    """
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        metamodel = CGPM_Metamodel({'barebones': BareBonesCGpm}, multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)

        # Table whose column a is missing in every row.
        bdb.sql_execute('''
            CREATE TABLE t (a REAL, b REAL, c REAL);
        ''')
        table_rows = [
            (None, None, 3),
            (None, None, 1),
            (None, None, 1),
            (None, -2, 1),
            (None, -5, 1),
            (None, 2, 3),
        ]
        for table_row in table_rows:
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', table_row)

        # a numerical, modeled by crosscat: initialization fails.
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                MODEL a, b, c AS NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL m FOR p WITH BASELINE crosscat;
        ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                INITIALIZE 2 MODELS FOR m;
            ''')

        # a nominal, modeled by crosscat: initialization fails.
        bdb.execute('''
            CREATE POPULATION p2 FOR t WITH SCHEMA(
                MODEL a AS NOMINAL;
                MODEL b, c AS NUMERICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m2 FOR p2 WITH BASELINE crosscat;')
        with pytest.raises(BQLError):
            bdb.execute('INITIALIZE 2 MODELS FOR m2;')

        # a ignored: initialization succeeds.
        bdb.execute('''
            CREATE POPULATION p3 FOR t WITH SCHEMA(
                IGNORE a;
                MODEL b, c AS NUMERICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m3 FOR p3 WITH BASELINE crosscat;')
        bdb.execute('INITIALIZE 2 MODELS FOR m3;')

        # a numerical but overridden by a dummy CGPM: initialization and
        # one analysis iteration both succeed.
        bdb.execute('''
            CREATE METAMODEL m4 FOR p WITH BASELINE crosscat(
                OVERRIDE MODEL FOR a GIVEN b USING barebones
            )
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m4;')
        bdb.execute('ANALYZE m4 FOR 1 ITERATION WAIT;')
Пример #9
0
def test_output_stattypes():
    """Check stattype handling for factor-analysis outputs and latents.

    Skipped when cgpm's FactorAnalysis cannot be imported.
    """
    try:
        from cgpm.factor.factor import FactorAnalysis
    except ImportError:
        # pytest.skip raises, so nothing below runs without the import.
        pytest.skip('no sklearn')
    with cgpm_dummy_satellites_bdb() as bdb:
        # A schema with no policy for class_of_orbit, perigee and period
        # is rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    MODEL apogee, launch_mass AS NUMERICAL;
                    MODEL country_of_operator AS CATEGORICAL
                )
            ''')
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                MODEL apogee, launch_mass AS NUMERICAL;
                MODEL country_of_operator AS CATEGORICAL
            )
        ''')
        metamodel = CGPM_Metamodel({'factor_analysis': FactorAnalysis})
        bayesdb_register_metamodel(bdb, metamodel)

        # Factor analysis over a categorical manifest variable: the
        # metamodel can be created, but initializing it raises.
        bdb.execute('''
            CREATE METAMODEL satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')

        # pc_2 declared both as LATENT and in EXPOSE: rejected as duplicate.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL satellites_g1 FOR satellites(
                    LATENT pc_2 CATEGORICAL,
                    OVERRIDE GENERATIVE MODEL FOR
                        apogee, launch_mass
                    AND EXPOSE pc_2 CATEGORICAL
                    USING factor_analysis(L=1)
                )
            ''')

        # Factor analysis with a categorical latent: the metamodel can be
        # created, but initializing it raises.
        bdb.execute('''
            CREATE METAMODEL satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR
                    apogee, launch_mass
                AND EXPOSE pc_2 CATEGORICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')

        # Factor analysis over numerical variables only is fine.
        bdb.execute('''
            CREATE METAMODEL satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;

                OVERRIDE MODEL FOR apogee, launch_mass, pc_3, pc_4
                USING factor_analysis(L=2);

                LATENT pc_4 NUMERICAL
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_g2')
        bdb.execute('ANALYZE satellites_g2 FOR 2 ITERATION WAIT;')

        # Timed analysis cannot transition baseline and foreign variables
        # together.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_g2 FOR 2 SECONDS WAIT (
                    VARIABLES country_of_operator, apogee, launch_mass, pc_3);
            ''')
        bdb.execute('''
            ANALYZE satellites_g2 FOR 1 ITERATION WAIT (
                VARIABLES apogee, launch_mass);
        ''')

        # Dependence probability of a manifest with a latent.
        manifest_latent = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF apogee WITH pc_3
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert manifest_latent[0][0] == 1.

        # Dependence probability of a latent with a latent.
        latent_latent = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF pc_3 WITH pc_4
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert latent_latent[0][0] == 1.

        # Mutual information of a manifest with a latent; only checked
        # for running to completion, the value is not inspected.
        bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF apogee WITH pc_4 USING 1 SAMPLES
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        # Mutual information of a latent with a latent; likewise.
        bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF pc_3 WITH pc_4 USING 1 SAMPLES
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
Пример #10
0
def test_unknown_stattype():
    """Check how an invented statistical type (QUAGGA) is handled.

    Skipped when cgpm's LinearRegression cannot be imported.
    """
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        # pytest.skip raises, so nothing below runs without the import.
        pytest.skip('no sklearn')
    with cgpm_dummy_satellites_bdb() as bdb:
        # New column relaunches, filled row by row with apogee + perigee.
        bdb.sql_execute('ALTER TABLE satellites_ucs ADD COLUMN relaunches')
        count_cursor = bdb.sql_execute('''
            SELECT COUNT(*) FROM satellites_ucs
        ''')
        n_rows = next(count_cursor)[0]
        # The rowids addressed are 1 through n_rows.
        for table_rowid in xrange(1, n_rows + 1):
            bdb.sql_execute(
                '''
                UPDATE satellites_ucs
                    SET relaunches = (SELECT apogee + perigee)
                    WHERE _rowid_ = ?
            ''', (table_rowid, ))

        # QUAGGA is not a statistical type yet, so the schema is rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    MODEL apogee, perigee, launch_mass, period
                    AS NUMERICAL;

                    MODEL class_of_orbit, country_of_operator
                    AS NOMINAL;

                    MODEL relaunches
                    AS QUAGGA
                )
            ''')

        # Invent the statistical type; the same schema is now accepted.
        bdb.sql_execute('INSERT INTO bayesdb_stattype VALUES (?)',
                        ('quagga', ))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee, perigee, launch_mass, period
                AS NUMERICAL;

                MODEL class_of_orbit, country_of_operator
                AS NOMINAL;

                MODEL relaunches
                AS QUAGGA
            )
        ''')
        metamodel = CGPM_Metamodel(
            {'kepler': Kepler, 'linreg': LinearRegression})
        bayesdb_register_metamodel(bdb, metamodel)

        # QUAGGA cannot be modeled by default.
        with pytest.raises(BQLError):
            bdb.execute('CREATE METAMODEL g0 FOR satellites USING cgpm')
        # QUAGGA cannot serve as an input of an overriding model.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR relaunches GIVEN apogee USING linreg;
                    OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
                )
            ''')

        # With an explicit distribution family QUAGGA can be modeled,
        # alone and as the input of an override.
        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON;
                OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
            )
        ''')
Пример #11
0
def test_cgpm_kepler():
    """End-to-end run of cgpm metamodels with linreg and kepler overrides.

    Skipped when cgpm's LinearRegression cannot be imported.
    """
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        # pytest.skip raises, so nothing below runs without the import.
        pytest.skip('no sklearn')
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION from PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        cgpm_registry = dict(kepler=Kepler, linreg=LinearRegression)
        metamodel = CGPM_Metamodel(cgpm_registry, multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)
        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                OVERRIDE GENERATIVE MODEL FOR period
                GIVEN apogee, perigee
                USING linreg
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR g0')
        individuals_before = bdb.execute(
            'SELECT COUNT(*) FROM bayesdb_cgpm_individual').fetchvalue()

        # Another generator: exponential launch mass instead of normal.
        bdb.execute('''
            CREATE METAMODEL g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR launch_mass TO EXPONENTIAL;
                OVERRIDE MODEL FOR period GIVEN apogee, perigee
                    USING kepler(quagga = eland);
                SUBSAMPLE 20
            )
        ''')
        individuals_after = bdb.execute(
            'SELECT COUNT(*) FROM bayesdb_cgpm_individual').fetchvalue()
        # Creating g1 with SUBSAMPLE 20 adds exactly 20 individual rows.
        assert individuals_after - individuals_before == 20

        bdb.execute('INITIALIZE 1 MODEL IF NOT EXISTS FOR g1')
        for analysis in (
                'ANALYZE g0 FOR 1 ITERATION WAIT',
                'ANALYZE g0 FOR 1 ITERATION WAIT (VARIABLES period)',
                'ANALYZE g1 FOR 1 ITERATION WAIT',
                'ANALYZE g1 FOR 1 ITERATION WAIT (VARIABLES period)',
        ):
            bdb.execute(analysis)
        # OPTIMIZED is ignored because period is a foreign variable.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION WAIT (OPTIMIZED; VARIABLES period)
        ''')
        # This should fail since we have a SET CATEGORY MODEL which is not
        # compatible with lovecat. The ValueError is from cgpm not bayeslite.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION WAIT
                    (OPTIMIZED; VARIABLES launch_mass)
            ''')
        # Timed analysis is rejected for a mix of variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS WAIT (VARIABLES period, apogee)
            ''')
        # Same rejection when the mix arises through SKIP (period remains).
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS WAIT (SKIP apogee)
            ''')
        # Iteration-based analysis accepts the mix.
        bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION WAIT (VARIABLES period, apogee)
            ''')

        # Crash tests: each query only has to run to completion.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE apogee, perigee, period FROM satellites LIMIT 100
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                PREDICT apogee
                    CONFIDENCE apogee_confidence
                    USING 5 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()

        # With CONFIDENCE: rows are (prediction, confidence) pairs.
        with_confidence = bdb.execute('''
            INFER EXPLICIT
                PREDICT class_of_orbit
                    CONFIDENCE class_of_orbit_confidence
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = with_confidence[0]
        assert len(first_row) == 2
        assert isinstance(first_row[0], unicode)
        assert isinstance(first_row[1], float)

        # Without CONFIDENCE: rows hold the prediction only.
        without_confidence = bdb.execute('''
            INFER EXPLICIT PREDICT class_of_orbit USING 2 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = without_confidence[0]
        assert len(first_row) == 1
        assert isinstance(first_row[0], unicode)

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP METAMODEL g0')
        bdb.execute('DROP METAMODEL g1')
Пример #12
0
def test_using_modelnos():
    """Exercise queries and analysis restricted to specific model numbers.

    Builds a population over the dummy satellites table together with a
    subsampled cgpm generator ``g0`` holding two models.  SIMULATE, INFER
    and ESTIMATE are run against selected models as crash tests, ANALYZE is
    run one model at a time while counting the logscore diagnostics each
    engine state has recorded, and out-of-range model numbers are required
    to raise BQLError.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        metamodel = CGPM_Metamodel({}, multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)
        bdb.execute('''
            CREATE ANALYSIS SCHEMA g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 2 ANALYSES FOR g0')

        # Smoke test: SIMULATE over both models.
        bdb.execute('''
            SIMULATE apogee, class_of_orbit
            FROM satellites
            MODELED BY g0
            USING ANALYSIS 0-1
            LIMIT 10
        ''')
        # Smoke test: INFER EXPLICIT over a single model.
        bdb.execute('''
            INFER EXPLICIT PREDICT period, perigee
            FROM satellites
            MODELED BY g0
            USING ANALYSIS 0
            LIMIT 2
        ''')

        # With a single model the dependence probability can only be 0 or 1.
        binary = (0, 1)

        # Smoke test: dependence probability, BY form.
        by_cursor = bdb.execute('''
            ESTIMATE
                DEPENDENCE PROBABILITY OF launch_mass WITH period
            BY satellites
            MODELED BY g0
            USING ANALYSIS 0
        ''')
        assert cursor_value(by_cursor) in binary
        # Smoke test: dependence probability, pairwise form.
        pairwise_cursor = bdb.execute('''
            ESTIMATE
                DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
            MODELED BY g0
            USING ANALYSIS 1
        ''')
        assert all(row[0] in binary for row in pairwise_cursor)
        # Smoke test: one-row mutual information.
        bdb.execute('''
            ESTIMATE
                MUTUAL INFORMATION WITH (period) USING 1 SAMPLES
            FROM VARIABLES OF satellites
            USING ANALYSIS 0
        ''').fetchall()

        # ANALYZE one model at a time; only the targeted model's state
        # should gain logscore diagnostics.
        bdb.execute('''
            ANALYZE g0
            ANALYSIS 0
            FOR 1 ITERATION
            CHECKPOINT 1 ITERATION
            WAIT;
        ''')
        # NOTE(review): 1 is presumably the generator id of g0 -- confirm.
        engine = bdb.metamodels['cgpm']._engine(bdb, 1)

        def logscore_counts():
            # Number of logscore entries held by states 0 and 1.
            return tuple(
                len(engine.states[index].diagnostics['logscore'])
                for index in (0, 1))

        assert logscore_counts() == (1, 0)
        bdb.execute('''
            ANALYZE g0
            ANALYSIS 1
            FOR 4 ITERATION
            CHECKPOINT 1 ITERATION
            WAIT (OPTIMIZED);
        ''')
        assert logscore_counts() == (1, 4)

        # Model numbers that do not exist must be rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 ANALYSIS 0-3 FOR 4 ITERATION WAIT;
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                SIMULATE apogee FROM satellites USING ANALYSIS 25 LIMIT 10;
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
                USING MODELS 0-8 LIMIT 2;
            ''')
Пример #13
0
def test_add_drop_models():
    """Check bayesdb-to-cgpm modelno bookkeeping across INITIALIZE and DROP.

    After every INITIALIZE / DROP MODELS statement the rows of
    ``bayesdb_cgpm_modelno`` for the generator are compared against the
    expected mapping: cgpm modelnos stay contiguous from zero, while bayesdb
    modelnos may have gaps and keep their relative order.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        metamodel = CGPM_Metamodel({}, multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES FOR (*);
            )
        ''')
        bdb.execute('CREATE METAMODEL m FOR p (SUBSAMPLE 10);')

        # Ids needed to query the mapping table directly.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def expected_mapping(bayesdb_modelnos):
            # The i-th bayesdb modelno listed is expected at cgpm modelno i.
            return {
                modelno: position
                for position, modelno in enumerate(bayesdb_modelnos)
            }

        def check_modelno_mapping(lookup):
            # Every stored row must match an entry of `lookup`, and no
            # entry of `lookup` may be left unmatched.  Consumes `lookup`.
            rows = bdb.sql_execute(
                '''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id, ))
            for modelno, cgpm_modelno in rows:
                assert lookup.pop(modelno) == cgpm_modelno
            assert not lookup

        # Fresh models map one-to-one.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        check_modelno_mapping(expected_mapping(xrange(16)))

        bdb.execute('ANALYZE m FOR 1 ITERATION WAIT (QUIET);')

        # Dropping models leaves gaps in the bayesdb modelnos, but the cgpm
        # modelnos are renumbered contiguously in the same order.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        survivors = [0, 2, 3, 4, 5, 6, 7, 13, 15]
        check_modelno_mapping(expected_mapping(survivors))

        # Analysis must still work on the renumbered models.
        bdb.execute('ANALYZE m FOR 1 ITERATION WAIT (OPTIMIZED; QUIET);')

        # INITIALIZE 14 MODELS IF NOT EXISTS only creates the missing
        # modelnos within 0-13; it does not guarantee 14 models in total.
        # Modelno 14 stays dropped and modelno 15 is left untouched, so
        # bayesdb holds 0-15 except 14 while cgpm holds 0-14.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        recreated = [1, 8, 9, 10, 11, 12]
        check_modelno_mapping(expected_mapping(survivors + recreated))

        # Drop a few more, then add them back along with new ones, and
        # confirm numbering and ordering stay correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        remaining = [2, 3, 4, 5, 6, 7, 13, 15] + [8, 9, 10, 11, 12]
        check_modelno_mapping(expected_mapping(remaining))
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        re_recreated = [0, 1]
        brand_new = [14, 16, 17, 18, 19]
        check_modelno_mapping(
            expected_mapping(remaining + re_recreated + brand_new))

        # Dropping modelnos that do not exist is an error.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop everything.
        bdb.execute('DROP MODELS FROM m;')
        # Nothing is left to drop.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # The mapping table must be empty for this generator.
        leftover = bdb.sql_execute(
            '''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id, ))
        assert cursor_value(leftover) == 0
Пример #14
0
def test_cgpm_extravaganza__ci_slow():
    """End-to-end smoke test of a cgpm generator with overridden models.

    Skipped when the optional cgpm regression / venturescript modules cannot
    be imported.  Otherwise the test fills an in-memory ``satellites_ucs``
    table with synthetic rows (two orbit classes x 1000 apogees x 10
    perigees), creates a population, checks that malformed generator schemas
    raise BQLError, then builds generator ``g0`` with two latent variables
    and venturescript / linreg / forest overrides.  It verifies the variable
    numbering, runs several ANALYZE variants, issues ESTIMATE / INFER /
    SIMULATE queries as crash tests, and finally drops everything it made.
    """
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        # pytest.skip raises, so no explicit return is needed after it.
        pytest.skip('no sklearn or venturescript')
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # Loop-invariant, so built once rather than once per inserted row.
        countries = ['US', 'Russia', 'China', 'Bulgaria']
        orbit_classes = [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]
        for orbit_class, period_fn in orbit_classes:
            for x in xrange(1000):
                for y in xrange(10):
                    # The order of the PRNG draws (country, name suffix,
                    # mass) is kept fixed so the generated data is
                    # reproducible for a given seed.
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                                          bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, orbit_class, x, y,
                          period_fn(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit CATEGORICAL;
                country_of_operator CATEGORICAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        # Crash test before any generator has been created.
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Metamodel(cgpm_registry)
        bayesdb_register_metamodel(bdb, cgpmt)

        # Unknown variable (misspelled "apoge") in SET CATEGORY MODEL.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Unknown variable (misspelled "apoge") as an override input.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # A latent may not reuse the name of a population variable.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        # The generator exposes two extra, negatively numbered variables on
        # top of the population's six (presumably the two latents above).
        population_vars = [1, 2, 3, 4, 5, 6]
        generator_vars = [-2, -1] + population_vars
        assert core.bayesdb_generator_column_numbers(bdb, generator_id) == \
            generator_vars
        assert core.bayesdb_variable_numbers(bdb, population_id, None) == \
            population_vars
        assert core.bayesdb_variable_numbers(
                bdb, population_id, generator_id) == \
            generator_vars

        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration WAIT (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration WAIT (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION WAIT (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        # Analysis must still work after the rejected program above.
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION WAIT')

        # Crash tests for queries over both manifest and latent variables.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELLED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELLED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELLED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELLED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELLED BY g0 LIMIT 4
        ''').fetchall()

        # Tear down: models, then generator, population and table.
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP METAMODEL g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
Пример #15
0
def test_output_stattypes():
    """Check how the factor_analysis cgpm reacts to its outputs' stattypes.

    Generators that hand a categorical variable (manifest or exposed) to
    factor_analysis can be created, but INITIALIZE on them must raise
    ValueError; a generator whose factor_analysis outputs are all numerical
    must initialize and analyze without error.
    """
    try:
        from cgpm.factor.factor import FactorAnalysis
    except ImportError:
        pytest.skip('no sklearn')
        # NOTE(review): pytest.skip raises, so this return is never reached.
        return
    with cgpm_dummy_satellites_bdb() as bdb:
        # Missing policy for class_of_orbit, perigee, period
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    MODEL apogee, launch_mass AS NUMERICAL;
                    MODEL country_of_operator AS CATEGORICAL
                )
            ''')
        # Same schema, but with the remaining columns explicitly ignored.
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                MODEL apogee, launch_mass AS NUMERICAL;
                MODEL country_of_operator AS CATEGORICAL
            )
        ''')
        registry = {
            'factor_analysis': FactorAnalysis,
        }
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(registry))
        # Creating factor analysis with categorical manifest should crash.
        # The CREATE itself succeeds; the failure surfaces at INITIALIZE.
        bdb.execute('''
            CREATE METAMODEL satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')
        with pytest.raises(BQLError):
            # Duplicate pc_2 in LATENT and EXPOSE.
            bdb.execute('''
                CREATE METAMODEL satellites_g1 FOR satellites(
                    LATENT pc_2 CATEGORICAL,
                    OVERRIDE GENERATIVE MODEL FOR
                        apogee, launch_mass
                    AND EXPOSE pc_2 CATEGORICAL
                    USING factor_analysis(L=1)
                )
            ''')
        # Creating factor analysis with categorical latent should crash.
        # As above, the CREATE succeeds and INITIALIZE raises.
        bdb.execute('''
            CREATE METAMODEL satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR
                    apogee, launch_mass
                AND EXPOSE pc_2 CATEGORICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')
        # Creating factor analysis with all numerical should be ok.
        bdb.execute('''
            CREATE METAMODEL satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;

                OVERRIDE MODEL FOR apogee, launch_mass, pc_3
                USING factor_analysis(L=1)
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_g2')
        bdb.execute('ANALYZE satellites_g2 FOR 2 ITERATION WAIT;')