def test_crosscat_constraints():
    """Check the constraint list (Y) that BQL statements pass to the engine.

    A recording subclass of the local crosscat engine stores the `Y`
    argument of its most recent probability / sample / impute call on
    the instance, so the test can compare it after each BQL statement.
    """
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        # Every override records Y and then defers to the parent engine.

        def predictive_probability_multistate(self, M_c, X_L_list, X_D_list,
                Y, Q):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            forward = parent.simple_predictive_probability_multistate
            return forward(M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list,
                Y=Y, Q=Q)

        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            kwargs = dict(seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)
            return super(FakeEngine, self).simple_predictive_sample(**kwargs)

        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            kwargs = dict(seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)
            return super(FakeEngine, self).impute_and_confidence(**kwargs)

    engine = FakeEngine(seed=0)
    metamodel = CrosscatMetamodel(engine)
    with bayesdb(metamodel=metamodel) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        create_generator_bql = ''' CREATE GENERATOR t1_cc FOR t1 USING crosscat( label CATEGORICAL, age NUMERICAL, weight NUMERICAL ) '''
        bdb.execute(create_generator_bql)
        generator_id = core.bayesdb_get_generator(bdb, 't1_cc')

        # Generator column numbers, in declaration order, are 1, 2, 3.
        for expected_colno, column in enumerate(('label', 'age', 'weight'), 1):
            assert core.bayesdb_generator_column_number(
                bdb, generator_id, column) == expected_colno

        # The crosscat-side column number is one less than the generator's.
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        for colno in (1, 2, 3):
            assert crosscat_cc_colno(bdb, generator_id, colno) == colno - 1

        bdb.execute('INITIALIZE 1 MODEL FOR t1_cc')
        bdb.execute('ANALYZE t1_cc FOR 1 ITERATION WAIT')

        estimate_bql = \
            'ESTIMATE PROBABILITY OF age = 8 GIVEN (weight = 16) BY t1_cc'
        bdb.execute(estimate_bql).next()
        assert engine._last_Y == [(28, 2, 16)]

        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM t1_cc WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]

        simulate_bql = 'SIMULATE weight FROM t1_cc GIVEN age = 8 LIMIT 1'
        bdb.execute(simulate_bql).next()
        assert engine._last_Y == [(28, 1, 8)]
def test_crosscat_constraints():
    """Verify the constraints (Y) handed to the crosscat engine by BQL.

    The engine used here is a thin recording subclass: each overridden
    entry point remembers its `Y` argument in `_last_Y` before calling
    the parent implementation.
    """
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        # Record Y, then forward the call unchanged to the real engine.

        def predictive_probability_multistate(self, M_c, X_L_list, X_D_list,
                Y, Q):
            self._last_Y = Y
            base = super(FakeEngine, self)
            return base.simple_predictive_probability_multistate(
                M_c=M_c,
                X_L_list=X_L_list,
                X_D_list=X_D_list,
                Y=Y,
                Q=Q)

        def simple_predictive_sample(self, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            base = super(FakeEngine, self)
            return base.simple_predictive_sample(
                M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

        def impute_and_confidence(self, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            base = super(FakeEngine, self)
            return base.impute_and_confidence(
                M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

    engine = FakeEngine(seed=0)
    crosscat_metamodel = CrosscatMetamodel(engine)
    with bayesdb(metamodel=crosscat_metamodel) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        schema_bql = """ CREATE GENERATOR t1_cc FOR t1 USING crosscat( label CATEGORICAL, age NUMERICAL, weight NUMERICAL ) """
        bdb.execute(schema_bql)
        gen_id = core.bayesdb_get_generator(bdb, "t1_cc")

        # Generator column numbers follow declaration order, from 1.
        expected_generator_colnos = (("label", 1), ("age", 2), ("weight", 3))
        for column, generator_colno in expected_generator_colnos:
            assert core.bayesdb_generator_column_number(
                bdb, gen_id, column) == generator_colno

        # Crosscat column numbers are the generator's, shifted down by one.
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        for generator_colno, cc_colno in ((1, 0), (2, 1), (3, 2)):
            assert crosscat_cc_colno(bdb, gen_id, generator_colno) == cc_colno

        for setup_bql in (
                "INITIALIZE 1 MODEL FOR t1_cc",
                "ANALYZE t1_cc FOR 1 ITERATION WAIT"):
            bdb.execute(setup_bql)

        bdb.execute(
            "ESTIMATE PROBABILITY OF age = 8 GIVEN (weight = 16) BY t1_cc"
        ).next()
        assert engine._last_Y == [(28, 2, 16)]

        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM t1_cc WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]

        bdb.execute("SIMULATE weight FROM t1_cc GIVEN age = 8 LIMIT 1").next()
        assert engine._last_Y == [(28, 1, 8)]
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    The phrase is dispatched on its AST node type:

    - queries (`ast.is_query`) are compiled to SQL and executed, and the
      resulting cursor is returned;
    - `ast.Begin`, `ast.Rollback`, `ast.Commit` are forwarded to `txn`;
    - `ast.CreateTabAs`, `ast.CreateTabSim`, `ast.DropTab`, `ast.AlterTab`
      operate on tables;
    - `ast.CreateGen`, `ast.DropGen`, `ast.AlterGen` operate on generators;
    - `ast.InitModels`, `ast.AnalyzeModels`, `ast.DropModels` operate on
      the models of a generator.

    Every non-query phrase returns an empty cursor, except `ast.DropTab`,
    which returns the cursor of the underlying DROP TABLE statement.

    `bindings` holds the parameter values; they are handed to the
    compiler output together with the parameter counts taken from an
    `ast.Parametrized` wrapper when `phrase` is one.

    Raises BQLError when a name is missing or already taken, and
    NotImplementedError for column renames and for analysis without
    WAIT.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" %
                    (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" %
                    (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb,
                    phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" %
                    (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.simulation.generator)
            # NOTE(review): `metamodel` and `qgn` are not used below; the
            # lookups are kept in case callers rely on them failing early.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            # A real list, because qcns is traversed three times below.
            qcns = [sqlite3_quote_name(cn) for cn in column_names]

            # Map each case-folded column name of the base table to its
            # declared SQL type.
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)

            # Both the simulated and the constrained columns must exist.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column" " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column" " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)),
                    )

            # Evaluate nsamples, modelno and the constraint expressions
            # with a single SELECT.
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                    out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id,
                    name), value)
                for (name, _expression), value in
                    zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [
                core.bayesdb_generator_column_number(bdb, generator_id, name)
                for name in column_names
            ]

            # Create the destination table and fill it with the samples.
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)"
                % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """ INSERT INTO %s (%s) VALUES (%s) """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for _qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos, modelno=modelno,
                numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" %
                    (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?",
                (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined as table" ": %s"
                                % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined" " as generator: %s"
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError("Renaming columns"
                        " not yet implemented.")
                    # Everything below in this branch is unreachable
                    # until the raise above is removed.
                    # Make sure the old name exists and the new name
                    # does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb,
                                "No such column in table %s" ": %s"
                                % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb,
                                "Column already exists" " in table %s: %s"
                                % (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """ UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql,
                        {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """ SELECT id FROM bayesdb_generator WHERE tabname = ? """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb,
                                generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" %
                            (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb,
                        cmd.generator)
                    # Clear the table's current default (at most one row),
                    # then mark the requested generator.
                    unset_default_sql = """ UPDATE bayesdb_generator SET defaultp = 0 WHERE tabname = ? AND defaultp """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """ UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ? \n"""
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """ UPDATE bayesdb_generator SET defaultp = 0 WHERE tabname = ? AND defaultp """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" %
                (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            def instantiate(columns):
                return instantiate_generator(
                    bdb,
                    phrase.name,
                    phrase.table,
                    metamodel,
                    columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )
            metamodel.create_generator(bdb, phrase.table, phrase.schema,
                instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """ DELETE FROM bayesdb_generator_column WHERE generator_id = ? """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """ DELETE FROM bayesdb_generator_model WHERE generator_id = ? """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """ DELETE FROM bayesdb_generator WHERE id = ? \n"""
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined as table" ": %s"
                                % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined" " as generator: %s"
                                % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = """ UPDATE bayesdb_generator SET name = ? WHERE id = ? """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno)
                )
            else:
                existing = set(
                    modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb,
                        "Generator %s already has models: %s"
                        % (repr(phrase.generator), sorted(existing))
                    )

            # Stop now if there's nothing to initialize.  Return a
            # cursor like every other path rather than None.
            if len(modelnos) == 0:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """ INSERT INTO bayesdb_generator_model (generator_id, modelno, iterations) VALUES (:generator_id, :modelno, :iterations) """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    "generator_id": generator_id,
                    "modelno": modelno,
                    "iterations": 0,
                })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warnings somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb,
            generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            # NOTE(review): unlike the other model commands, there is no
            # explicit "No such generator" check before this lookup.
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                # Every explicitly named model must exist.
                lookup_model_sql = """ SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        "generator_id": generator_id,
                        "modelno": modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb,
                            "No such model" " in generator %s: %s"
                            % (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                # No model numbers given: drop all of the generator's models.
                drop_models_sql = """ DELETE FROM bayesdb_generator_model WHERE generator_id = ? """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """ DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        "generator_id": generator_id,
                        "modelno": modelno,
                    })
        return empty_cursor(bdb)

    assert False                # XXX
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    The phrase is dispatched on its AST node type:

    - queries (`ast.is_query`) are compiled to SQL and executed, and the
      resulting cursor is returned;
    - `ast.Begin`, `ast.Rollback`, `ast.Commit` are forwarded to `txn`;
    - `ast.CreateTabAs`, `ast.CreateTabSim`, `ast.DropTab`, `ast.AlterTab`
      operate on tables;
    - `ast.CreateGen`, `ast.DropGen`, `ast.AlterGen` operate on generators;
    - `ast.InitModels`, `ast.AnalyzeModels`, `ast.DropModels` operate on
      the models of a generator.

    Every non-query phrase returns an empty cursor, except `ast.DropTab`,
    which returns the cursor of the underlying DROP TABLE statement.

    `bindings` holds the parameter values; they are handed to the
    compiler output together with the parameter counts taken from an
    `ast.Parametrized` wrapper when `phrase` is one.

    Raises BQLError when a name is missing or already taken, and
    NotImplementedError for column renames and for analysis without
    WAIT.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb,
                    phrase.simulation.generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.simulation.generator)
            # NOTE(review): `metamodel` and `qgn` are not used below; the
            # lookups are kept in case callers rely on them failing early.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            # A real list, because qcns is traversed three times below.
            qcns = [sqlite3_quote_name(cn) for cn in column_names]

            # Map each case-folded column name of the base table to its
            # declared SQL type.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)

            # Both the simulated and the constrained columns must exist.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator),
                         repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator),
                         repr(table),
                         repr(column_name)))

            # Evaluate nsamples, modelno and the constraint expressions
            # with a single SELECT.
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.nsamples, out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                    out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id,
                        name), value)
                    for (name, _expression), value in
                        zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]

            # Create the destination table and fill it with the samples.
            bdb.sql_execute('CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '',
                 qn,
                 ','.join(
                    '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                    for qcn, column_name in zip(qcns, column_names))))
            insert_sql = ''' INSERT INTO %s (%s) VALUES (%s) ''' % \
                (qn, ','.join(qcns), ','.join('?' for _qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb, generator_id, constraints,
                    colnos, modelno=modelno, numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table' ': %s' %
                                (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined' ' as generator: %s' %
                                (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Everything below in this branch is unreachable
                    # until the raise above is removed.
                    # Make sure the old name exists and the new name
                    # does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb,
                                'No such column in table %s' ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb,
                                'Column already exists' ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = ''' UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = ''' SELECT id FROM bayesdb_generator WHERE tabname = ? '''
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb,
                                generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, 'No such generator: %s' %
                            (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb,
                        cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    # Clear the table's current default (at most one row),
                    # then mark the requested generator.
                    unset_default_sql = ''' UPDATE bayesdb_generator SET defaultp = 0 WHERE tabname = ? AND defaultp '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes in \
                        (0, 1)
                    set_default_sql = ''' UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ? \n'''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = ''' UPDATE bayesdb_generator SET defaultp = 0 WHERE tabname = ? AND defaultp '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes in \
                        (0, 1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, 'No such metamodel: %s' %
                (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                # An existing generator is an error unless the phrase
                # asked for IF NOT EXISTS, in which case nothing is done.
                if not phrase.ifnotexists:
                    raise BQLError(bdb,
                        'Name already defined as generator: %s' %
                        (repr(phrase.name),))
            else:
                def instantiate(columns):
                    return instantiate_generator(bdb, phrase.name,
                        phrase.table, metamodel, columns,
                        default=phrase.default)
                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                    instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = ''' DELETE FROM bayesdb_generator_column WHERE generator_id = ? \n'''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = ''' DELETE FROM bayesdb_generator WHERE id = ? '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table' ': %s' %
                                (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined' ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = ''' UPDATE bayesdb_generator SET name = ? WHERE id = ? '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb,
                        'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.  Return a
            # cursor like every other path rather than None.
            if len(modelnos) == 0:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = ''' INSERT INTO bayesdb_generator_model (generator_id, modelno, iterations) VALUES (:generator_id, :modelno, :iterations) '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': 0,
                })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warnings somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            # NOTE(review): unlike the other model commands, there is no
            # explicit "No such generator" check before this lookup.
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                # Every explicitly named model must exist.
                lookup_model_sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'No such model' ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                # No model numbers given: drop all of the generator's models.
                drop_models_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False                # XXX
def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
        numsamples=None):
    """Predict a value for the cell [rowid, colno] with a confidence metric.

    For a local column with no foreign children, or whose children are all
    missing in this row, the call is delegated to the internal crosscat
    generator and its result is returned unchanged.  Otherwise samples are
    drawn (via `self.simulate` for local columns, via the foreign predictor
    for foreign columns) and summarized here.

    Parameters:
        bdb: database handle, passed through to `core` and to helpers.
        genid: id of this generator.
        modelno: model number, passed through to simulate/predict calls.
        colno: generator column number of the cell to predict.
        rowid: row id of the cell to predict.
        numsamples: number of samples to draw; defaults to `self.n_samples`.

    Returns:
        `(imp_val, imp_conf * parent_conf)` where `parent_conf` is the
        minimum confidence over any parents that had to be imputed first
        (1 if none were).

    Raises:
        ValueError: if the column's stattype is neither 'categorical' nor
            'numerical'.
    """
    # XXX Prefer accuracy over speed for imputation.
    if numsamples is None:
        numsamples = self.n_samples
    colnos = core.bayesdb_generator_column_numbers(bdb, genid)
    colnames = core.bayesdb_generator_column_names(bdb, genid)
    row = core.bayesdb_generator_row_values(bdb, genid, rowid)
    # Account for multiple imputations if imputing parents.
    parent_conf = 1
    # Predicting lcol.
    if colno in self.lcols(bdb, genid):
        # Delegate to CC IFF
        # (lcol has no children OR all its children are None).
        children = [f for f in self.fcols(bdb, genid)
            if colno in self.pcols(bdb, genid, f)]
        # Pair values with their column numbers the same way Y is built
        # below, rather than assuming column numbers are 1..len(row).
        if len(children) == 0 or \
                all(v is None for c, v in zip(colnos, row)
                    if c in children):
            return self.cc(bdb, genid).predict_confidence(
                bdb, self.cc_id(bdb, genid), modelno,
                self.cc_colno(bdb, genid, colno), rowid)
        else:
            # Obtain likelihood weighted samples from posterior.
            Q = [(rowid, colno)]
            Y = [(rowid, c, v) for c, v in zip(colnos, row)
                if c != colno and v is not None]
            samples = self.simulate(bdb, genid, modelno, Q, Y,
                numpredictions=numsamples)
            samples = [s[0] for s in samples]
    # Predicting fcol.
    else:
        # The parent set does not depend on the column being tested, so
        # look it up once instead of once per column.
        parents = self.pcols(bdb, genid, colno)
        conditions = {c: v for c, v in zip(colnames, row)
            if core.bayesdb_generator_column_number(bdb, genid, c)
                in parents}
        # Iterate over a snapshot because entries are overwritten below.
        for colname, val in list(conditions.items()):
            # Impute all missing parents.
            if val is None:
                imp_col = core.bayesdb_generator_column_number(bdb, genid,
                    colname)
                imp_val, imp_conf = self.predict_confidence(bdb, genid,
                    modelno, imp_col, rowid, numsamples=numsamples)
                # XXX If imputing several parents, take the overall
                # conf as min conf. If we define imp_conf as
                # P[imp_val = correct] then we might choose to multiply
                # the imp_confs, but we cannot assert that the imp_confs
                # are independent so multiplying is extremely
                # conservative.
                parent_conf = min(parent_conf, imp_conf)
                conditions[colname] = imp_val
        assert all(v is not None for v in conditions.values())
        predictor = self.predictor(bdb, genid, colno)
        samples = predictor.simulate(numsamples, conditions)
    # Since foreign predictor does not know how to impute, imputation
    # shall occur here in the composer by simulate/logpdf calls.
    stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
    if stattype == 'categorical':
        # imp_val is the most frequent sample.
        imp_val = max(set(samples), key=samples.count)
        if colno in self.fcols(bdb, genid):
            imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
        else:
            # Force true division: with integer operands Python 2 would
            # floor the fraction of matching samples to 0 or 1.
            imp_conf = sum(np.array(samples)==imp_val) / float(len(samples))
    elif stattype == 'numerical':
        # XXX The definition of confidence is P[k=1] where
        # k=1 is the number of mixture components (we need a distribution
        # over GPMM to answer this question). The confidence is instead
        # implemented as \max_i{p_i} where p_i are the weights of a
        # fitted DPGMM.
        imp_val = np.mean(samples)
        imp_conf = su.continuous_imputation_confidence(samples, None, None,
            n_steps=1000)
    else:
        raise ValueError('Unknown stattype "{}" for a foreign predictor '
            'column encountered in predict_confidence.'.format(stattype))
    return imp_val, imp_conf * parent_conf
def create_generator(self, bdb, table, schema, instantiate):
    """Instantiate this generator and record its composer metadata.

    Parses `schema`, instantiates the generator through `instantiate`,
    creates an internal crosscat generator over the local columns (named
    after this generator with a '_cc' suffix), and then, inside one
    savepoint, stores: the crosscat generator id, which columns are
    local, the parents of each foreign column, the topological order of
    the foreign columns, and the predictor name of each foreign column.
    """
    # Parse the schema; the list of foreign column names is not needed.
    (columns, lcols, _fcols, fcol_to_pcols, fcol_to_fpred,
        dependencies) = self.parse(schema)

    # Instantiate **this** generator.
    genid, bdbcolumns = instantiate(columns.items())

    # The internal crosscat generator shares this generator's name, plus
    # a fixed suffix.
    cc_name = bayeslite.core.bayesdb_generator_name(bdb, genid) + '_cc'

    # Render the crosscat schema: one "name stattype" entry per local
    # column, followed by the dependency directives.
    column_clauses = [
        '{} {}'.format(quote(name), quote(columns[name])) for name in lcols
    ]
    cc_cols = ','.join(column_clauses)
    dependency_clauses = []
    for is_dependent, names in dependencies:
        quoted_names = ','.join(map(quote, names))
        template = 'DEPENDENT({})' if is_dependent else 'INDEPENDENT({})'
        dependency_clauses.append(template.format(quoted_names))
    create_cc_bql = """ CREATE GENERATOR {} FOR {} USING crosscat( {}, {} ); """.format(
        quote(cc_name), quote(table), cc_cols, ','.join(dependency_clauses))
    bdb.execute(create_cc_bql)

    # Translate the foreign -> parents map from column names to numbers.
    fcolno_to_pcolnos = {}
    for fname in fcol_to_pcols:
        fnumber = core.bayesdb_generator_column_number(bdb, genid, fname)
        fcolno_to_pcolnos[fnumber] = [
            core.bayesdb_generator_column_number(bdb, genid, pname)
            for pname in fcol_to_pcols[fname]
        ]

    insert_cc_id_sql = ''' INSERT INTO bayesdb_composer_cc_id (generator_id, crosscat_generator_id) VALUES (?,?) '''
    insert_owner_sql = ''' INSERT INTO bayesdb_composer_column_owner (generator_id, colno, local) VALUES (?,?,?) '''
    insert_parent_sql = ''' INSERT INTO bayesdb_composer_column_parents (generator_id, fcolno, pcolno) VALUES (?,?,?) '''
    insert_toposort_sql = ''' INSERT INTO bayesdb_composer_column_toposort (generator_id, colno, position) VALUES (?,?,?) '''
    insert_predictor_sql = ''' INSERT INTO bayesdb_composer_column_foreign_predictor (generator_id, colno, predictor_name) VALUES (?,?,?) '''

    with bdb.savepoint():
        # Save internal cc generator id.
        cc_genid = core.bayesdb_get_generator(bdb, cc_name)
        bdb.sql_execute(insert_cc_id_sql, (genid, cc_genid,))

        # Save lcols/fcolnos: a column is local unless it has an entry in
        # the foreign -> parents map.
        for column in bdbcolumns:
            number, _, _ = column
            is_local = number not in fcolno_to_pcolnos
            bdb.sql_execute(insert_owner_sql, (genid, number, int(is_local),))

        # Save parents of foreign columns.
        for fnumber in fcolno_to_pcolnos:
            for pnumber in fcolno_to_pcolnos[fnumber]:
                bdb.sql_execute(insert_parent_sql, (genid, fnumber, pnumber,))

        # Save topological order.
        ordering = self.topological_sort(fcolno_to_pcolnos)
        for position, entry in enumerate(ordering):
            number, _ = entry
            bdb.sql_execute(insert_toposort_sql, (genid, number, position,))

        # Save predictor names of foreign columns.
        for fnumber in fcolno_to_pcolnos:
            fname = core.bayesdb_generator_column_name(bdb, genid, fnumber)
            fp_name = fcol_to_fpred[casefold(fname)]
            bdb.sql_execute(insert_predictor_sql,
                (genid, fnumber, casefold(fp_name)))