def bayesdb_simulate(bdb, population_id, constraints, colnos,
        generator_id=None, numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno, value)``.

    The results are simulated from the predictive distribution on
    fresh rows.

    A constraint whose `colno` is one of the tokens returned by
    ``core.bayesdb_rowid_tokens(bdb)`` is not a modelling constraint:
    its value replaces the fresh row id as the row to condition on and
    simulate for.  At most one such constraint may be given.

    If `generator_id` is None, every generator returned by
    ``core.bayesdb_population_generators`` takes part.  When there is
    more than one, `numpredictions` is split among them by a
    multinomial draw from ``bdb.np_prng`` whose weights are
    proportional to each generator's likelihood of the constraints.

    Raises `BQLError` if more than one rowid constraint is given, or if
    every participating generator assigns the constraints zero
    likelihood.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints is not None:
        # Look the tokens up once instead of once per constraint.
        # NOTE(review): assumes the result is a reusable container, as
        # the repeated `in` tests of the previous code already implied.
        rowid_tokens = core.bayesdb_rowid_tokens(bdb)
        user_rowid = [v for c, v in constraints if c in rowid_tokens]
        if len(user_rowid) == 1:
            rowid = user_rowid[0]
        elif len(user_rowid) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
        constraints = [
            (rowid, c, v) for c, v in constraints if c not in rowid_tokens
        ]
    targets = [(rowid, colno) for colno in colnos]

    def loglikelihood(generator_id, metamodel):
        # With nothing to condition on, every generator is equally likely.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, generator_id, constraints, [], None)

    def simulate(generator_id, metamodel, n):
        return metamodel.simulate_joint(
            bdb, generator_id, targets, constraints, None,
            num_predictions=n, accuracy=accuracy)

    generator_ids = [generator_id] if generator_id is not None else \
        core.bayesdb_population_generators(bdb, population_id)
    # Use a distinct loop name so the `generator_id` parameter is not
    # clobbered by Python 2's leaky list-comprehension scoping.
    metamodels = [
        core.bayesdb_generator_metamodel(bdb, gid) for gid in generator_ids
    ]
    if len(generator_ids) > 1:
        loglikelihoods = [
            loglikelihood(gid, metamodel)
            for gid, metamodel in zip(generator_ids, metamodels)
        ]
        max_loglikelihood = max(loglikelihoods)
        if max_loglikelihood == float('-inf'):
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb, 'Impossible constraints: %r' % (constraints,))
        # Normalize relative to the largest log-likelihood.  The best
        # generator gets weight exp(0) == 1, so the total can never
        # underflow to zero (which would be misreported as impossible
        # constraints) and exp() can never overflow.
        weights = [math.exp(ll - max_loglikelihood) for ll in loglikelihoods]
        total_weight = sum(weights)
        probabilities = [weight/total_weight for weight in weights]
        countses = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)
        counts = countses[0]
    else:
        counts = [numpredictions]
    rowses = map(simulate, generator_ids, metamodels, counts)
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Separate a rowid selection from the remaining constraints.

    `constraints` is a list of ``(colno, value)`` pairs, or a false
    value (None or an empty list) when there are no constraints.  A
    pair whose `colno` is one of the tokens returned by
    ``core.bayesdb_rowid_tokens(bdb)`` names the row to use rather than
    constraining a variable.

    Returns ``(rowid, constraints)``: `rowid` is the value of the rowid
    pair if exactly one was given, otherwise the population's fresh row
    id; `constraints` is the input with rowid pairs removed (a false
    input is returned unchanged).

    Raises `BQLError` if more than one rowid pair is given.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints:
        # Look the tokens up once instead of twice per constraint.
        # NOTE(review): assumes the result is a reusable container, as
        # the repeated `in` tests of the previous code already implied.
        rowid_tokens = core.bayesdb_rowid_tokens(bdb)
        user_rowid = [v for c, v in constraints if c in rowid_tokens]
        if len(user_rowid) == 1:
            rowid = user_rowid[0]
        elif len(user_rowid) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
        constraints = [
            (c, v) for c, v in constraints if c not in rowid_tokens
        ]
    return rowid, constraints
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Pick the rowid to use and strip rowid entries from `constraints`.

    `constraints` holds ``(colno, value)`` pairs; it may also be None
    or empty, in which case it is handed back untouched together with
    the population's fresh row id.

    A pair whose `colno` appears in ``core.bayesdb_rowid_tokens(bdb)``
    supplies the rowid instead of constraining a variable.  With no
    such pair the fresh row id is used; with exactly one its value is
    used; with more than one `BQLError` is raised.

    Returns the tuple ``(rowid, constraints)`` where `constraints` is a
    new list containing only the non-rowid pairs, in their original
    order.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if not constraints:
        return rowid, constraints

    # Fetch the tokens once; the old code re-fetched them for every
    # membership test.
    # NOTE(review): assumes the result is a reusable container, as the
    # repeated `in` tests of the previous code already implied.
    rowid_tokens = core.bayesdb_rowid_tokens(bdb)

    # Partition in a single pass over the constraints.
    user_rowids = []
    remaining = []
    for c, v in constraints:
        if c in rowid_tokens:
            user_rowids.append(v)
        else:
            remaining.append((c, v))

    if len(user_rowids) > 1:
        raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
    if user_rowids:
        rowid = user_rowids[0]
    return rowid, remaining
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `phrase` may be wrapped in `ast.Parametrized`, in which case its
    numbered/named parameter information is used together with
    `bindings` when compiling to SQL.

    The phrase kinds dispatched on here are:

    - queries (``ast.is_query``): compiled and run via `execute_wound`;
    - `ast.Begin`, `ast.Rollback`, `ast.Commit`: transaction control;
    - `ast.CreateTabAs`: create a table from a compiled query;
    - `ast.CreateTabCsv`: create a table from a CSV file;
    - `ast.CreateTabSim`: create a table and fill it with rows from
      ``bqlfn.bayesdb_simulate``.

    All but queries return ``empty_cursor(bdb)``.

    Raises `BQLError` when the target table already exists (and the
    phrase does not say IF NOT EXISTS), or when a simulation names an
    unknown population, generator, or variable.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb, phrase.name, phrase.csv,
                header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            # Validate the names involved before touching the database.
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Name already defined as table: %s'
                        % (repr(phrase.name), ))
            if not core.bayesdb_has_population(
                    bdb, phrase.simulation.population):
                raise BQLError(
                    bdb,
                    'No such population: %s'
                    % (phrase.simulation.population, ))
            population_id = core.bayesdb_get_population(
                bdb, phrase.simulation.population)
            generator_id = None
            if phrase.simulation.generator is not None:
                if not core.bayesdb_has_generator(
                        bdb, population_id, phrase.simulation.generator):
                    raise BQLError(
                        bdb,
                        'No such generator: %r'
                        % (phrase.simulation.generator, ))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.simulation.generator)
            table = core.bayesdb_population_table(bdb, population_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = phrase.simulation.columns
            # A real list: it is iterated several times below, which a
            # lazy map() object would not survive.
            qcns = [
                sqlite3_quote_name(column_name)
                for column_name in column_names
            ]

            # Learn the SQL type of every column of the base table so
            # the new table can declare matching types.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        'No such variable'
                        ' in population %r: %s'
                        % (phrase.simulation.population, column_name))
            for column_name, _expression in phrase.simulation.constraints:
                cn = casefold(column_name)
                if (cn not in column_sqltypes and
                        cn not in core.bayesdb_rowid_tokens(bdb)):
                    raise BQLError(
                        bdb,
                        'No such variable in population %s: %s'
                        % (phrase.simulation.population, column_name))

            # Evaluate the sample count and every constraint expression
            # with a single SELECT: column 0 is the count, the rest are
            # the constraint values in order.
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(
                    bdb, phrase.simulation.nsamples, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(
                    out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)

            def map_var(var):
                # Rowid tokens are passed through by (casefolded) name;
                # everything else becomes a variable number.
                if casefold(var) not in core.bayesdb_rowid_tokens(bdb):
                    return core.bayesdb_variable_number(
                        bdb, population_id, generator_id, var)
                else:
                    return casefold(var)

            # Pair each constraint's variable with its evaluated value.
            # Unpacking happens in the comprehension target rather than
            # in a tuple parameter, which Python 3 no longer accepts.
            constraints = [
                (map_var(var), value)
                for (var, _expression), value
                in zip(phrase.simulation.constraints, cursor[0][1:])
            ]
            colnos = [map_var(column_name) for column_name in column_names]

            schema = ','.join(
                '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                for qcn, column_name in zip(qcns, column_names))
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)'
                % ('TEMP ' if phrase.temp else '',
                   'IF NOT EXISTS ' if phrase.ifnotexists else '',
                   qn, schema))
            insert_sql = ''' INSERT INTO %s (%s) VALUES (%s) ''' % (
                qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(
                    bdb, population_id, constraints, colnos,
                    generator_id=generator_id, numpredictions=nsamples,
                    accuracy=phrase.simulation.accuracy):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)
def map_var(var):
    """Translate a variable name into the key used for simulation.

    A name whose casefolded form is one of the tokens returned by
    ``core.bayesdb_rowid_tokens(bdb)`` is returned casefolded.  Any
    other name is resolved with ``core.bayesdb_variable_number``.

    `bdb`, `population_id` and `generator_id` are not parameters; they
    are read from the surrounding scope.
    """
    folded = casefold(var)
    if folded in core.bayesdb_rowid_tokens(bdb):
        return folded
    return core.bayesdb_variable_number(
        bdb, population_id, generator_id, var)