def test_bayesdb_population_fresh_row_id():
    """Fresh row id is 1 on an empty table and len(rows)+1 after loading."""
    column_spec = [
        'id IGNORE',
        'label NOMINAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    with bayesdb_population(
            bayesdb(), 't1', 'p1', 'p1_cc', t1_schema, lambda x: 0,
            columns=column_spec) as (bdb, population_id, generator_id):
        # Before any data is loaded, the first unused rowid is 1.
        assert core.bayesdb_population_fresh_row_id(bdb, population_id) == 1
        t1_data(bdb)
        # After loading, the fresh rowid is one past the last loaded row.
        expected = len(t1_rows) + 1
        assert core.bayesdb_population_fresh_row_id(bdb, population_id) \
            == expected
def test_bayesdb_population_fresh_row_id():
    """Loaded rowids all exist, the next one does not, and it is the fresh id."""
    with bayesdb_population(
            bayesdb(), 't1', 'p1', 'p1_cc', t1_schema, lambda x: 0,
            columns=[
                'id IGNORE',
                'label NOMINAL',
                'age NUMERICAL',
                'weight NUMERICAL',
            ]) as (bdb, population_id, _generator_id):
        # Empty table: the first unused rowid is 1.
        assert core.bayesdb_population_fresh_row_id(bdb, population_id) == 1
        t1_data(bdb)
        n_rows = len(t1_rows)
        # Rowids 1..n_rows are present; n_rows+1 is not yet used.
        for rowid in xrange(1, n_rows + 1):
            assert core.bayesdb_table_has_rowid(bdb, 't1', rowid)
        assert not core.bayesdb_table_has_rowid(bdb, 't1', n_rows + 1)
        assert core.bayesdb_population_fresh_row_id(bdb, population_id) \
            == n_rows + 1
def test_bayesdb_population_fresh_row_id():
    """Check fresh-row-id bookkeeping against the rowids actually in t1."""
    with bayesdb_population(
            bayesdb(), 't1', 'p1', 'p1_cc', t1_schema, lambda x: 0,
            columns=['id IGNORE', 'label NOMINAL', 'age NUMERICAL',
                'weight NUMERICAL']) \
            as (bdb, population_id, _generator_id):
        # No data yet: next unused rowid is 1.
        assert core.bayesdb_population_fresh_row_id(bdb, population_id) == 1
        t1_data(bdb)
        n_rows = len(t1_rows)
        # Every loaded rowid exists in the base table.
        assert all(
            core.bayesdb_table_has_rowid(bdb, 't1', i + 1)
            for i in xrange(n_rows))
        # The rowid just past the loaded data does not exist yet...
        assert not core.bayesdb_table_has_rowid(bdb, 't1', n_rows + 1)
        # ...and that is exactly the fresh rowid reported.
        assert core.bayesdb_population_fresh_row_id(bdb, population_id) \
            == n_rows + 1
def bql_row_column_predictive_probability(bdb, population_id, generator_id,
        rowid, colno):
    """Predictive probability of the observed value in cell (rowid, colno).

    Returns None when the cell is NULL.  Otherwise conditions on all other
    non-NULL cells in the row, averages the log density across the
    population's generators (or the single generator given), and returns
    the exponentiated result.
    """
    cell = core.bayesdb_population_cell_value(bdb, population_id, rowid, colno)
    if cell is None:
        return None
    # Retrieve all other values in the row to condition on.
    row_values = core.bayesdb_population_row_values(bdb, population_id, rowid)
    variable_numbers = core.bayesdb_variable_numbers(bdb, population_id, None)
    # Pose the query at a fresh (unobserved) rowid.
    fresh_rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    query = [(colno, cell)]
    constraints = []
    for var, val in zip(variable_numbers, row_values):
        # Skip NULL cells and the target column itself.
        if val is not None and var != colno:
            constraints.append((var, val))
    def generator_predprob(gid):
        metamodel = core.bayesdb_generator_metamodel(bdb, gid)
        return metamodel.logpdf_joint(
            bdb, gid, fresh_rowid, query, constraints, None)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    predprobs = [generator_predprob(gid) for gid in generator_ids]
    return ieee_exp(logmeanexp(predprobs))
def bql_row_column_predictive_probability(bdb, population_id, generator_id,
        modelnos, rowid, targets, constraints):
    """Predictive probability of target cells in `rowid`, given constraints.

    `targets` and `constraints` arrive as JSON-encoded lists of column
    numbers; cells whose value is NULL are dropped.  Returns None if every
    target cell is NULL.
    """
    targets = json.loads(targets)
    constraints = json.loads(constraints)
    modelnos = _retrieve_modelnos(modelnos)
    # Pose the query at a fresh (unobserved) rowid.
    fresh_rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    def retrieve_values(colnos):
        # Pair each column with its cell value, keeping non-NULL cells only.
        pairs = []
        for colno in colnos:
            v = core.bayesdb_population_cell_value(
                bdb, population_id, rowid, colno)
            if v is not None:
                pairs.append((colno, v))
        return pairs
    cgpm_targets = retrieve_values(targets)
    # If all targets have NULL values, return None.
    if not cgpm_targets:
        return None
    cgpm_constraints = retrieve_values(constraints)
    def generator_predprob(gid):
        metamodel = core.bayesdb_generator_metamodel(bdb, gid)
        return metamodel.logpdf_joint(
            bdb, gid, modelnos, fresh_rowid, cgpm_targets, cgpm_constraints)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    predprobs = [generator_predprob(gid) for gid in generator_ids]
    return ieee_exp(logmeanexp(predprobs))
def bql_pdf_joint(bdb, population_id, generator_id, *args):
    """Return the joint density of target values given constraint values.

    ``args`` is a flat list: (colno, value) pairs of targets, optionally
    followed by a single ``None`` separator and then (colno, value) pairs
    of constraints.

    Raises ValueError if a colno arrives without its paired value.
    """
    # A nonexistent (`unobserved') row id.
    fake_row_id = core.bayesdb_population_fresh_row_id(bdb, population_id)
    i = 0
    targets = []
    # Consume target (colno, value) pairs until a None separator or the
    # end of the argument list.
    while i < len(args):
        if args[i] is None:
            # Separator reached: everything after it is a constraint.
            i += 1
            break
        if i + 1 == len(args):
            raise ValueError('Missing logpdf target value: %r' % (args[i],))
        t_colno = args[i]
        t_value = args[i + 1]
        targets.append((fake_row_id, t_colno, t_value))
        i += 2
    constraints = []
    # Consume the remaining (colno, value) constraint pairs.
    while i < len(args):
        if i + 1 == len(args):
            raise ValueError('Missing logpdf constraint value: %r'
                % (args[i],))
        c_colno = args[i]
        c_value = args[i + 1]
        constraints.append((fake_row_id, c_colno, c_value))
        i += 2
    logp = _bql_logpdf(bdb, population_id, generator_id, targets, constraints)
    return ieee_exp(logp)
def bql_row_column_predictive_probability(
        bdb, population_id, generator_id, modelnos, rowid, targets,
        constraints):
    """Predictive probability of target cells in `rowid`, given constraints.

    `targets` and `constraints` are JSON-encoded lists of column numbers;
    NULL cells are dropped.  Returns None if every target cell is NULL.
    """
    targets = json.loads(targets)
    constraints = json.loads(constraints)
    modelnos = _retrieve_modelnos(modelnos)
    # Pose the query at a fresh (unobserved) rowid.
    fresh_rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    def retrieve_values(colnos):
        # Look up each cell, then discard the NULL ones.
        pairs = [
            (colno, core.bayesdb_population_cell_value(
                bdb, population_id, rowid, colno))
            for colno in colnos
        ]
        return [(c, v) for (c, v) in pairs if v is not None]
    cgpm_targets = retrieve_values(targets)
    # If all targets have NULL values, return None.
    if not cgpm_targets:
        return None
    cgpm_constraints = retrieve_values(constraints)
    def generator_predprob(gid):
        backend = core.bayesdb_generator_backend(bdb, gid)
        return backend.logpdf_joint(
            bdb, gid, modelnos, fresh_rowid, cgpm_targets, cgpm_constraints)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    predprobs = map(generator_predprob, generator_ids)
    return ieee_exp(logmeanexp(predprobs))
def bayesdb_simulate(bdb, population_id, constraints, colnos,
        generator_id=None, numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the constraints
    in the list `constraints` of tuples ``(colno, value)``.

    The results are simulated from the predictive distribution on fresh
    rows.
    """
    # Default to a fresh (unobserved) rowid unless the constraints name one.
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints is not None:
        user_rowid = [
            v for c, v in constraints if c in core.bayesdb_rowid_tokens(bdb)
        ]
        if len(user_rowid) == 1:
            rowid = user_rowid[0]
        elif len(user_rowid) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
        # Rebind the remaining constraints to the chosen rowid, dropping
        # the rowid tokens themselves.
        constraints = [
            (rowid, c, v) for c, v in constraints
            if c not in core.bayesdb_rowid_tokens(bdb)
        ]
    targets = [(rowid, colno) for colno in colnos]
    def loglikelihood(generator_id, metamodel):
        # Weight of this generator: density of the constraints under it.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, generator_id, constraints, [], None)
    def simulate(generator_id, metamodel, n):
        return metamodel.simulate_joint(
            bdb, generator_id, targets, constraints, None,
            num_predictions=n, accuracy=accuracy)
    generator_ids = [generator_id] if generator_id is not None else \
        core.bayesdb_population_generators(bdb, population_id)
    metamodels = [core.bayesdb_generator_metamodel(bdb, generator_id)
        for generator_id in generator_ids]
    if len(generator_ids) > 1:
        # Multiple generators: apportion the requested predictions among
        # them, multinomially weighted by each generator's likelihood of
        # the constraints.
        loglikelihoods = map(loglikelihood, generator_ids, metamodels)
        likelihoods = map(math.exp, loglikelihoods)
        total_likelihood = sum(likelihoods)
        if total_likelihood == 0:
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb,
                'Impossible constraints: %r' % (constraints,))
        probabilities = [likelihood/total_likelihood
            for likelihood in likelihoods]
        countses = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)
        counts = countses[0]
    else:
        # Single generator: it simulates everything.
        counts = [numpredictions]
    rowses = map(simulate, generator_ids, metamodels, counts)
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Split any user-supplied rowid out of `constraints`.

    Returns ``(rowid, constraints)``, where ``rowid`` defaults to a fresh
    (unobserved) row id unless exactly one rowid token appears among the
    constraints, and the returned constraints exclude rowid tokens.
    Raises BQLError if more than one rowid is given.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints:
        explicit_rowids = []
        for colno, val in constraints:
            if colno in core.bayesdb_rowid_tokens(bdb):
                explicit_rowids.append(val)
        if len(explicit_rowids) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints, ))
        if len(explicit_rowids) == 1:
            rowid = explicit_rowids[0]
        constraints = [(colno, val) for colno, val in constraints
            if colno not in core.bayesdb_rowid_tokens(bdb)]
    return rowid, constraints
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Separate a caller-specified rowid from the remaining constraints.

    Returns ``(rowid, constraints)``: a fresh row id unless the
    constraints name exactly one rowid, plus the constraints with any
    rowid tokens removed.  Raises BQLError on multiple rowids.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if not constraints:
        # Nothing to inspect; keep the fresh rowid.
        return rowid, constraints
    user_rowid = [
        value for token, value in constraints
        if token in core.bayesdb_rowid_tokens(bdb)
    ]
    if len(user_rowid) > 1:
        raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
    elif len(user_rowid) == 1:
        rowid = user_rowid[0]
    remaining = [
        (token, value) for token, value in constraints
        if token not in core.bayesdb_rowid_tokens(bdb)
    ]
    return rowid, remaining
def bql_column_value_probability(bdb, population_id, generator_id, colno,
        value, *constraint_args):
    """Probability that `colno` takes `value`, given constraint cells.

    `constraint_args` is a flat sequence of (colno, value) pairs.
    Raises ValueError if it has odd length.
    """
    # A nonexistent (`unobserved') row id.
    fake_row_id = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if len(constraint_args) % 2 == 1:
        raise ValueError('Odd constraint arguments: %s' % (constraint_args, ))
    constraints = [
        (fake_row_id, constraint_args[i], constraint_args[i + 1])
        for i in range(0, len(constraint_args), 2)
    ]
    targets = [(fake_row_id, colno, value)]
    logp = _bql_logpdf(bdb, population_id, generator_id, targets, constraints)
    return ieee_exp(logp)
def bdb(): bdb = bayesdb_open(':memory:') # Create the population of complements. bdb.sql_execute('CREATE TABLE t (a TEXT, b TEXT)') for _ in xrange(20): bdb.sql_execute('INSERT INTO t (a, b) VALUES (0,1)') for _ in xrange(20): bdb.sql_execute('INSERT INTO t (a, b) VALUES (1,0)') # Create the population and metamodel on the existing rows. bdb.execute('CREATE POPULATION p FOR t (MODEL a, b AS NOMINAL)') bdb.execute('CREATE METAMODEL m FOR p;') bdb.execute('INITIALIZE 1 MODELS FOR m;') bdb.execute('ANALYZE m FOR 1000 ITERATION WAIT (OPTIMIZED);') # Add new 'hypothetical' rows into the base table to serve as out-of- # sample probe points; only zeros, only ones, and nothing. for _ in xrange(40, 50): bdb.sql_execute('INSERT INTO t (a) VALUES (0)') for _ in xrange(50, 60): bdb.sql_execute('INSERT INTO t (b) VALUES (1)') for _ in xrange(60, 80): bdb.sql_execute('INSERT INTO t (a,b) VALUES (NULL, NULL)') # Make sure fresh_row_id 80 from the base table, not metamodel. population_id = bayesdb_get_population(bdb, 'p') assert bayesdb_population_fresh_row_id(bdb, population_id) == 81 # Make sure the cgpm only has 40 rowids incorporated. generator_id = bayesdb_get_generator(bdb, population_id, 'm') cursor = bdb.sql_execute( ''' SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual WHERE generator_id = ? ''', (generator_id, )) assert cursor_value(cursor) == 40 # Turn off multiprocessing for sequence of queries. bdb.metamodels['cgpm'].set_multiprocess(False) return bdb
def bdb(): bdb = bayesdb_open(':memory:') # Create the population of complements. bdb.sql_execute('CREATE TABLE t (a TEXT, b TEXT)') for _ in xrange(20): bdb.sql_execute('INSERT INTO t (a, b) VALUES (0,1)') for _ in xrange(20): bdb.sql_execute('INSERT INTO t (a, b) VALUES (1,0)') # Create the population and generator on the existing rows. bdb.execute('CREATE POPULATION p FOR t (SET STATTYPES OF a, b TO NOMINAL)') bdb.execute('CREATE GENERATOR m FOR p;') bdb.execute('INITIALIZE 1 MODELS FOR m;') bdb.execute('ANALYZE m FOR 1000 ITERATION (OPTIMIZED);') # Add new 'hypothetical' rows into the base table to serve as out-of- # sample probe points; only zeros, only ones, and nothing. for _ in xrange(40, 50): bdb.sql_execute('INSERT INTO t (a) VALUES (0)') for _ in xrange(50, 60): bdb.sql_execute('INSERT INTO t (b) VALUES (1)') for _ in xrange(60, 80): bdb.sql_execute('INSERT INTO t (a,b) VALUES (NULL, NULL)') # Make sure fresh_row_id 80 from the base table, not generator. population_id = bayesdb_get_population(bdb, 'p') assert bayesdb_population_fresh_row_id(bdb, population_id) == 81 # Make sure the cgpm only has 40 rowids incorporated. generator_id = bayesdb_get_generator(bdb, population_id, 'm') cursor = bdb.sql_execute(''' SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual WHERE generator_id = ? ''', (generator_id,)) assert cursor_value(cursor) == 40 # Turn off multiprocessing for sequence of queries. bdb.backends['cgpm'].set_multiprocess(False) return bdb
def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints, num_samples=1, accuracy=None):
    """Simulate `targets` given `constraints` via the Loom preql server.

    `targets` is a list of column numbers; `constraints` a list of
    (colno, value) pairs.  If `rowid` is an existing row, its non-NULL
    cells are added to the constraints.  Returns `num_samples` rows, each
    a list of values ordered like `targets`.  `modelnos` and `accuracy`
    are accepted for interface compatibility but not used here.
    """
    # Retrieve the population id.
    population_id = bayesdb_generator_population(bdb, generator_id)
    # If rowid exists, retrieve conditioning data from the table.
    # BUG FIX: fresh_row_id takes a population id; the original passed
    # generator_id, which misidentifies observed rows whenever the two
    # ids differ.
    if rowid != bayesdb_population_fresh_row_id(bdb, population_id):
        row_values_raw = bayesdb_population_row_values(
            bdb, population_id, rowid)
        row_values = [
            str(a) if isinstance(a, unicode) else a for a in row_values_raw
        ]
        # NOTE(review): enumerate() uses list position as the column
        # number — assumes row_values is ordered by variable number;
        # confirm against bayesdb_population_row_values.
        row = [
            entry for entry in enumerate(row_values) if entry[1] is not None
        ]
        constraints_colnos = [c[0] for c in constraints]
        row_colnos = [r[0] for r in row]
        if any(colno in constraints_colnos for colno in row_colnos):
            # BUG FIX: the original message lacked the space between
            # 'and' and 'target'.
            raise BQLError(bdb, 'Overlap between constraints and '
                'target row in simulate.')
        constraints.extend(row)
    # Prepare the query row to provide to Loom: targets get an empty
    # placeholder, constraints their fixed values.
    row = {}
    target_num_to_name = {}
    for colno in targets:
        name = bayesdb_variable_name(bdb, generator_id, None, colno)
        target_num_to_name[colno] = name
        row[name] = ''
    for (colno, value) in constraints:
        name = bayesdb_variable_name(bdb, generator_id, None, colno)
        row[name] = value
    # Fetch the server.
    server = self._get_cache_entry(bdb, generator_id, 'preql_server')
    # Prepare the csv header; Loom works with lowercased names, so keep a
    # map back to the original spelling.
    csv_headers, csv_values = zip(*row.iteritems())
    lower_to_upper = {str(a).lower(): str(a) for a in csv_headers}
    # BUG FIX: derive the lowercased headers from csv_headers directly;
    # taking lower_to_upper.keys() depends on arbitrary dict ordering and
    # can misalign headers with csv_values.
    csv_headers = [str(a).lower() for a in csv_headers]
    csv_values = [str(a) for a in csv_values]
    # Retrieve the samples from the server.
    outfile = StringIO()
    writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
    reader = iter([csv_headers] + [csv_values])
    server._predict(reader, num_samples, writer, False)
    output = writer.result()
    # Parse the returned csv: first line is the header, rest are samples.
    returned_headers = [
        lower_to_upper[a]
        for a in output.strip().split('\r\n')[0].split(CSV_DELIMITER)
    ]
    loom_output = [
        zip(returned_headers, a.split(CSV_DELIMITER))
        for a in output.strip().split('\r\n')[1:]
    ]
    return_list = []
    for row in loom_output:
        # Prepare the row, converting non-nominal values to float.
        row_values = []
        row_dict = dict(row)
        for colno in targets:
            colname = target_num_to_name[colno]
            value = row_dict[colname]
            stattype = bayesdb_variable_stattype(
                bdb, population_id, None, colno)
            if not _is_nominal(stattype):
                value = float(value)
            row_values.append(value)
        # Add this row to the return list.
        return_list.append(row_values)
    return return_list