def test_bayesdb_generator_fresh_row_id():
    # The fresh row id of an empty table is 1; after loading the t1
    # fixture data it is one past the last inserted row.
    schema_columns = ['label CATEGORICAL', 'age NUMERICAL', 'weight NUMERICAL']
    with bayesdb_generator(bayesdb(), 't1', 't1_cc', t1_schema, lambda x: 0,
            columns=schema_columns) as (bdb, generator_id):
        assert core.bayesdb_generator_fresh_row_id(bdb, generator_id) == 1
        t1_data(bdb)
        expected = len(t1_rows) + 1
        assert core.bayesdb_generator_fresh_row_id(bdb, generator_id) == \
            expected
def test_bayesdb_generator_fresh_row_id():
    """Fresh row id is 1 on an empty table and len(t1_rows) + 1 after load."""
    ctx = bayesdb_generator(
        bayesdb(), "t1", "t1_cc", t1_schema, lambda x: 0,
        columns=["label CATEGORICAL", "age NUMERICAL", "weight NUMERICAL"],
    )
    with ctx as (bdb, generator_id):
        assert core.bayesdb_generator_fresh_row_id(bdb, generator_id) == 1
        t1_data(bdb)
        assert core.bayesdb_generator_fresh_row_id(bdb, generator_id) \
            == len(t1_rows) + 1
def bayesdb_simulate(bdb, generator_id, constraints, colnos, modelno=None,
        numpredictions=1):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, one value per column in
    `colnos`, conditioned on the `(colno, value)` pairs in `constraints`
    (which may be None for no constraints).

    The results are simulated from the predictive distribution on fresh
    rows.
    """
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    # Use a row id past the end of the table, i.e. an unobserved row.
    fresh_rowid = core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    targets = [(fresh_rowid, c) for c in colnos]
    pinned = None
    if constraints is not None:
        pinned = [(fresh_rowid, c, v) for c, v in constraints]
    return metamodel.simulate_joint(bdb, generator_id, targets, pinned,
        modelno, num_predictions=numpredictions)
def bql_pdf_joint(bdb, generator_id, modelno, *args):
    """Return the probability (density) of targets given constraints.

    `args` is a flat sequence: (colno, value) target pairs, then a -1
    sentinel, then (colno, value) constraint pairs.
    """
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    # A nonexistent (`unobserved') row id.
    fake_row_id = core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    targets = []
    pos = 0
    # Collect (rowid, colno, value) targets up to the -1 separator.
    while pos < len(args):
        if args[pos] == -1:
            pos += 1
            break
        if pos + 1 == len(args):
            raise ValueError('Missing logpdf target value: %r' % (args[pos],))
        targets.append((fake_row_id, args[pos], args[pos + 1]))
        pos += 2
    # Everything after the separator is a constraint pair.
    constraints = []
    while pos < len(args):
        if pos + 1 == len(args):
            raise ValueError('Missing logpdf constraint value: %r'
                % (args[pos],))
        constraints.append((fake_row_id, args[pos], args[pos + 1]))
        pos += 2
    logp = metamodel.logpdf_joint(bdb, generator_id, targets, constraints,
        modelno)
    return ieee_exp(logp)
def bql_pdf_joint(bdb, generator_id, modelno, *args):
    """Return the probability (density) of targets given constraints.

    ``args`` is flat: (colno, value) target pairs, then the sentinel -1,
    then (colno, value) constraint pairs.
    """
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    # A nonexistent (`unobserved') row id.
    fake_row_id = core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    targets = []
    constraints = []
    parsing_targets = True
    i = 0
    while i < len(args):
        # The -1 sentinel is only recognized at a pair boundary in the
        # target region; afterwards -1 is an ordinary column number.
        if parsing_targets and args[i] == -1:
            parsing_targets = False
            i += 1
            continue
        if i + 1 == len(args):
            if parsing_targets:
                raise ValueError('Missing logpdf target value: %r'
                    % (args[i],))
            raise ValueError('Missing logpdf constraint value: %r'
                % (args[i],))
        triple = (fake_row_id, args[i], args[i + 1])
        if parsing_targets:
            targets.append(triple)
        else:
            constraints.append(triple)
        i += 2
    logp = metamodel.logpdf_joint(bdb, generator_id, targets, constraints,
        modelno)
    return ieee_exp(logp)
def column_mutual_information(self, bdb, genid, modelno, colno0, colno1,
        numsamples=None):
    """Estimate mutual information between two columns.

    Averages conditional_mutual_information over all models when
    `modelno` is None, otherwise over just that model.
    """
    if numsamples is None:
        numsamples = self.n_samples
    # NOTE(review): numsamples is resolved above but never forwarded to
    # conditional_mutual_information -- confirm whether that is intended.
    # XXX Aggregator only.
    row_id = core.bayesdb_generator_fresh_row_id(bdb, genid)
    targets0 = [(row_id, colno0)]
    targets1 = [(row_id, colno1)]
    conditions = evidence = []
    if modelno is not None:
        modelnos = [modelno]
    else:
        modelnos = core.bayesdb_generator_modelnos(bdb, genid)
    with bdb.savepoint():
        total = 0.0
        for m in modelnos:
            total += self.conditional_mutual_information(
                bdb, genid, m, targets0, targets1, conditions, evidence)
        mi = total / float(len(modelnos))
    return mi
def bql_column_value_probability(bdb, generator_id, modelno, colno, value,
        *constraint_args):
    """Probability (density) that `colno` takes `value`, given constraints.

    `constraint_args` is a flat sequence of alternating column numbers
    and values.
    """
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    # A nonexistent (`unobserved') row id.
    fake_row_id = core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    # An unpaired trailing argument means the caller dropped a value.
    if len(constraint_args) % 2 != 0:
        raise ValueError('Odd constraint arguments: %s'
            % (constraint_args,))
    constraints = [
        (fake_row_id, constraint_args[i], constraint_args[i + 1])
        for i in range(0, len(constraint_args), 2)
    ]
    targets = [(fake_row_id, colno, value)]
    logp = metamodel.logpdf_joint(bdb, generator_id, targets, constraints,
        modelno)
    return ieee_exp(logp)
def bql_column_value_probability(bdb, generator_id, modelno, colno, value,
        *constraint_args):
    """Probability (density) of ``colno`` = ``value`` under constraints.

    ``constraint_args`` alternates column numbers and values.
    """
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    # A nonexistent (`unobserved') row id.
    fake_row_id = core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    constraints = []
    remaining = list(constraint_args)
    while remaining:
        if len(remaining) == 1:
            # A lone trailing column number has no paired value.
            raise ValueError('Odd constraint arguments: %s'
                % (constraint_args,))
        c_colno = remaining.pop(0)
        c_value = remaining.pop(0)
        constraints.append((fake_row_id, c_colno, c_value))
    targets = [(fake_row_id, colno, value)]
    r = metamodel.logpdf_joint(
        bdb, generator_id, targets, constraints, modelno)
    return ieee_exp(r)
def predict_confidence(self, bdb, generator_id, modelno, colno, rowid,
        numsamples=None):
    """Predict a value for `colno` of row `rowid`; return (value, confidence).

    Categorical columns are imputed with the modal simulated value and
    confidence is the mode's relative frequency among the samples.
    Numerical columns are imputed with the sample mean; their confidence
    is punted (always 0).
    """
    if not numsamples:
        numsamples = 2
    assert numsamples > 0

    def _impute_categorical(sample):
        # Mode of the simulated values; confidence = mode frequency.
        counts = Counter(s[0] for s in sample)
        mode_count = max(counts[v] for v in counts)
        # Use the next() built-in rather than the Python-2-only
        # iterator .next() method, which raises AttributeError on 3.x.
        pred = next(v for v in counts if counts[v] == mode_count)
        conf = float(mode_count) / numsamples
        return pred, conf

    def _impute_numerical(sample):
        pred = sum(s[0] for s in sample) / float(len(sample))
        conf = 0    # XXX Punt confidence for now
        return pred, conf

    constraints = []
    # If rowid is a hypothetical cell for cgpm (did not exist at the time
    # of INITIALIZE), but exists in the base table (by INSERT INTO), then
    # retrieve all values for rowid as the constraints.
    exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    max_cgpm_rowid = bdb.sql_execute('''
        SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
        WHERE generator_id = ?
    ''', (generator_id,)).fetchall()[0][0]
    # MAX() yields NULL (None) when cgpm has incorporated no rows; treat
    # every existing row as hypothetical then, and avoid the None
    # comparison that raises TypeError on Python 3.
    hypothetical = max_cgpm_rowid is None or rowid > max_cgpm_rowid
    if exists and hypothetical:
        population_id = core.bayesdb_generator_population(
            bdb, generator_id)
        # Retrieve all other variables except colno, and ignore latents in
        # generator_id, and place them in the constraints.
        pop_names = core.bayesdb_variable_names(bdb, population_id, None)
        avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
        constraints_names = [n for n in pop_names if n != avoid_name]
        # Obtain the row.
        qt_names = str.join(',', map(sqlite3_quote_name, constraints_names))
        qt_table = sqlite3_quote_name(
            core.bayesdb_population_table(bdb, population_id))
        data = bdb.sql_execute('''
            SELECT %s FROM %s WHERE oid = ?
        ''' % (qt_names, qt_table), (rowid,)).fetchall()[0]
        # Build the constraints, skipping only NULL (None) cells: zero
        # and empty-string are legitimate observed values and must not
        # be filtered by truthiness.
        pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
        constraints_nos = [n for n in pop_nos if n != colno]
        assert len(data) == len(constraints_nos)
        constraints = [(rowid, c, v)
            for c, v in zip(constraints_nos, data) if v is not None]
    # Retrieve the samples.
    sample = self.simulate_joint(bdb, generator_id, [(rowid, colno)],
        constraints, modelno, numsamples)
    # Determine the imputation strategy (mode or mean).
    stattype = core.bayesdb_variable_stattype(
        bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
    if _is_categorical(stattype):
        return _impute_categorical(sample)
    else:
        return _impute_numerical(sample)