Exemplo n.º 1
0
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites database with a population and backend.

    Opens the database provided by ``cgpm_dummy_satellites_bdb``, creates
    the ``satellites`` population over the ``satellites_ucs`` table, and
    registers a single-process CGPM backend before handing the database
    to the caller.

    NOTE(review): this is a bare generator here; presumably it is wrapped
    by a context-manager or fixture decorator outside this snippet --
    confirm against the full file.
    """
    with cgpm_dummy_satellites_bdb() as database:
        create_population = '''
            create population satellites for satellites_ucs with schema(
                apogee numerical;
                class_of_orbit nominal;
                country_of_operator nominal;
                launch_mass numerical;
                perigee numerical;
                period numerical
            )
        '''
        database.execute(create_population)
        # multiprocess=0 keeps the CGPM backend in a single process.
        bayesdb_register_backend(
            database, CGPM_Backend({}, multiprocess=0))
        yield database
Exemplo n.º 2
0
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites database with a population and metamodel.

    Opens the database provided by ``cgpm_dummy_satellites_bdb``, creates
    the ``satellites`` population over the ``satellites_ucs`` table using
    the ``model ... as ...`` schema syntax, and registers a single-process
    CGPM metamodel before handing the database to the caller.

    NOTE(review): this is a bare generator here; presumably it is wrapped
    by a context-manager or fixture decorator outside this snippet --
    confirm against the full file.
    """
    with cgpm_dummy_satellites_bdb() as database:
        create_population = '''
            create population satellites for satellites_ucs with schema(
                model apogee as numerical;
                model class_of_orbit as categorical;
                model country_of_operator as categorical;
                model launch_mass as numerical;
                model perigee as numerical;
                model period as numerical
            )
        '''
        database.execute(create_population)
        # multiprocess=0 keeps the CGPM metamodel in a single process.
        bayesdb_register_metamodel(
            database, CGPM_Metamodel({}, multiprocess=0))
        yield database
Exemplo n.º 3
0
def test_regress_bonanza__ci_integration():
    """Exercise the BQL REGRESS command on the dummy satellites population.

    Registers a CGPM metamodel, creates the ``satellites`` population and
    a metamodel ``m`` with two models, then runs REGRESS with numerical,
    nominal, mixed, wildcard and subquery-selected regressors, checking
    the names of the returned coefficients each time.  Finally checks
    that mixing ``*`` with named variables raises ``BQLError`` and that
    regressing on ``class_of_orbit`` alone raises ``ValueError``.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS NOMINAL;
                MODEL country_of_operator AS NOMINAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL m FOR satellites WITH BASELINE crosscat;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            """Assert every result row names an expected coefficient.

            Each row must be a pair whose first item is a variable name
            that has not been seen in an earlier row and that is either
            listed in `numericals` or starts with ``<nominal>_dum_`` for
            some entry of `nominals`.
            """
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELLED BY m USING MODEL 1;
        ''', (3,)).fetchall()

        # Re-run the selector on its own, with the same LIMIT binding, to
        # learn which variables the REGRESS above was given.
        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
                ORDER BY dependence probability with apogee DESC
                LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
Exemplo n.º 4
0
def test_regress_bonanza__ci_integration():
    """Exercise the BQL REGRESS command on the dummy satellites population.

    Registers a CGPM backend, creates the ``satellites`` population and a
    generator ``m`` with two models, then runs REGRESS with numerical,
    nominal, mixed, wildcard and subquery-selected regressors, checking
    the names of the returned coefficients each time.  Finally checks
    that mixing ``*`` with named variables raises ``BQLError`` and that
    regressing on ``class_of_orbit`` alone raises ``ValueError``.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee                  NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                launch_mass             NUMERICAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL;
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR satellites;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            """Assert every result row names an expected coefficient.

            Each row must be a pair whose first item is a variable name
            that has not been seen in an earlier row and that is either
            listed in `numericals` or starts with ``<nominal>_dum_`` for
            some entry of `nominals`.
            """
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELED BY m USING MODEL 1;
        ''', (3,)).fetchall()

        # Re-run the selector on its own, with the same LIMIT binding, to
        # learn which variables the REGRESS above was given.
        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
                ORDER BY dependence probability with apogee DESC
                LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
Exemplo n.º 5
0
def test_analysis_subproblems_basic():
    """Run ANALYZE on analysis schema ``g0`` one subproblem at a time.

    Builds the ``satellites`` population, registers a CGPM metamodel and
    creates ``g0`` with a subsample of 10 and four analyses.  It then:

    * runs each clustering subproblem with and without ``OPTIMIZED``;
    * runs the variable-hyperparameters subproblem, and checks that
      combining it with ``OPTIMIZED`` raises ``BQLError``;
    * restricts ANALYZE to the subsampled rows, and checks that rows
      outside the subsample raise ``BQLError``.
    """
    optimized_clauses = ('', 'OPTIMIZED;')
    clustering_subproblems = (
        'variable clustering',
        'variable clustering concentration',
        'row clustering',
        'row clustering concentration',
    )
    with cgpm_dummy_satellites_bdb() as bdb:
        create_population = '''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        '''
        bdb.execute(create_population)
        metamodel = CGPM_Metamodel(dict(), multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)
        create_schema = '''
            CREATE ANALYSIS SCHEMA g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        '''
        bdb.execute(create_schema)
        bdb.execute('INITIALIZE 4 ANALYSES FOR g0')

        # Every clustering subproblem on its own; variable hyperparameters
        # are handled separately below.
        for clause in optimized_clauses:
            for name in clustering_subproblems:
                analyze_one = '''
                    ANALYZE g0 ANALYSES 0,1 FOR 4 ITERATION WAIT(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (name, clause)
                bdb.execute(analyze_one)

        # Variable hyperparameters, restricted to two variables.
        analyze_hypers = '''
            ANALYZE g0 FOR 1 ITERATION WAIT (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        '''
        bdb.execute(analyze_hypers)
        # OPTIMIZED backend does not support variable hyperparameters.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS WAIT (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # Row restrictions: look up which table rows were subsampled.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        rowid_query = '''
            SELECT table_rowid FROM  bayesdb_cgpm_individual
            WHERE generator_id = ?
        '''
        sampled = [
            record[0] for record in bdb.execute(rowid_query, (generator_id,))
        ]
        excluded = [rowid for rowid in xrange(20) if rowid not in sampled]
        sampled_csv = ','.join(str(rowid) for rowid in sampled)
        excluded_csv = ','.join(str(rowid) for rowid in excluded)
        for clause in optimized_clauses:
            analyze_sampled = '''
                ANALYZE g0 ANALYSIS 3 FOR 1 ITERATION WAIT (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
            )
            ''' % (sampled_csv, clause)
            bdb.execute(analyze_sampled)
            # Fail on rows not in the population or subsample.
            with pytest.raises(BQLError):
                analyze_excluded = '''
                    ANALYZE g0 ANALYSIS 3 FOR 1 ITERATION WAIT (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                )
                ''' % (excluded_csv, clause)
                bdb.execute(analyze_excluded)
Exemplo n.º 6
0
def test_analysis_subproblems_basic():
    """Run ANALYZE on generator ``g0`` one subproblem at a time.

    Builds the ``satellites`` population, registers a CGPM backend and
    creates ``g0`` with a subsample of 10 and four models.  It then:

    * runs each clustering subproblem with and without ``OPTIMIZED``;
    * runs the variable-hyperparameters subproblem, and checks that
      combining it with ``OPTIMIZED`` raises ``BQLError``;
    * restricts ANALYZE to the subsampled rows, and checks that rows
      outside the subsample raise ``BQLError``.
    """
    optimized_clauses = ('', 'OPTIMIZED;')
    clustering_subproblems = (
        'variable clustering',
        'variable clustering concentration',
        'row clustering',
        'row clustering concentration',
    )
    with cgpm_dummy_satellites_bdb() as bdb:
        create_population = '''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        '''
        bdb.execute(create_population)
        backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, backend)
        create_generator = '''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        '''
        bdb.execute(create_generator)
        bdb.execute('INITIALIZE 4 MODELS FOR g0')

        # Every clustering subproblem on its own; variable hyperparameters
        # are handled separately below.
        for clause in optimized_clauses:
            for name in clustering_subproblems:
                analyze_one = '''
                    ANALYZE g0 MODELS 0,1 FOR 4 ITERATION(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (name, clause)
                bdb.execute(analyze_one)

        # Variable hyperparameters, restricted to two variables.
        analyze_hypers = '''
            ANALYZE g0 FOR 1 ITERATION (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        '''
        bdb.execute(analyze_hypers)
        # OPTIMIZED backend does not support variable hyperparameters.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # Row restrictions: look up which table rows were subsampled.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        rowid_query = '''
            SELECT table_rowid FROM  bayesdb_cgpm_individual
            WHERE generator_id = ?
        '''
        sampled = [
            record[0] for record in bdb.execute(rowid_query, (generator_id,))
        ]
        excluded = [rowid for rowid in xrange(20) if rowid not in sampled]
        sampled_csv = ','.join(str(rowid) for rowid in sampled)
        excluded_csv = ','.join(str(rowid) for rowid in excluded)
        for clause in optimized_clauses:
            analyze_sampled = '''
                ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
            )
            ''' % (sampled_csv, clause)
            bdb.execute(analyze_sampled)
            # Fail on rows not in the population or subsample.
            with pytest.raises(BQLError):
                analyze_excluded = '''
                    ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                )
                ''' % (excluded_csv, clause)
                bdb.execute(analyze_excluded)