def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Return a dict from case-folded column name to case-folded stattype.

    Covers every column that `core.bayesdb_generator_column_names`
    reports for the generator `generator_id`.
    """
    names = core.bayesdb_generator_column_names(bdb, generator_id)
    # Look up each stattype first, then fold both halves of the pair.
    raw_pairs = (
        (n, core.bayesdb_generator_column_stattype(bdb, generator_id, n))
        for n in names
    )
    return dict((casefold(n), casefold(st)) for n, st in raw_pairs)
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Build the {column name: stattype} table for a generator.

    Both keys and values are passed through `casefold`, so lookups in
    the result must use case-folded column names.
    """
    def folded_entry(column):
        # Fetch the stattype before folding, one column at a time.
        kind = core.bayesdb_generator_column_stattype(
            bdb, generator_id, column)
        return casefold(column), casefold(kind)

    result = {}
    for column in core.bayesdb_generator_column_names(bdb, generator_id):
        key, value = folded_entry(column)
        result[key] = value
    return result
def parse(schema, subsample_default):
    '''Parse a crosscat generator schema into a GeneratorSchema.

    `schema` is the tokenized argument of "crosscat" in

        CREATE GENERATOR ... FOR ... USING crosscat(...)

    as handed to CrosscatMetamodel.create_generator, i.e. a list of
    directives such as

        [['GUESS', ['*']], ['x', 'NUMERICAL'], ...]

    `subsample_default` is the subsample setting used when the schema
    has no SUBSAMPLE directive.

    Raises BQLError for a malformed directive.  See
    test_crosscat_generator_schema.py for examples.
    '''
    guess_all = False
    subsample_setting = subsample_default
    modelled = []
    dependencies = []

    for item in schema:
        # An empty directive comes from a stray comma, which lets you
        # write a trailing comma after the last column:
        #
        #    CREATE GENERATOR t_cc FOR t USING crosscat(
        #        x,
        #        y,
        #        z,
        #    )
        if item == []:
            continue

        well_formed = (
            isinstance(item, list)
            and len(item) == 2
            and isinstance(item[0], basestring)
        )
        if not well_formed:
            raise BQLError(
                None,
                'Invalid crosscat column model directive: %r' % (item,))

        head, body = item
        op = casefold(head)

        if op == 'guess' and body == ['*']:
            guess_all = True
            continue
        if op == 'subsample' and isinstance(body, list) and len(body) == 1:
            subsample_setting = _parse_subsample_clause(body[0])
            continue
        if op in ('dependent', 'independent'):
            # The second element records whether the columns are
            # constrained to be dependent (True) or independent (False).
            dependencies.append(
                (_parse_dependent_clause(body), op == 'dependent'))
            continue
        if op != 'guess' and casefold(body) != 'guess':
            # Plain (column name, stattype) pair; keep original case.
            modelled.append((head, body))
            continue

        # item is known to be a list here, so this formats it whole.
        raise BQLError(None, 'Invalid crosscat column model: %r' % (item,))

    return GeneratorSchema(
        guess=guess_all,
        subsample=subsample_setting,
        columns=modelled,
        dep_constraints=dependencies)
def parse(schema, subsample_default):
    '''Turn a tokenized crosscat schema into a GeneratorSchema.

    The schema is what CrosscatMetamodel.create_generator receives for
    the argument of "crosscat" in CREATE GENERATOR ... FOR ... USING
    crosscat(...): a list of two-element directives, for example

        [['GUESS', ['*']], ['x', 'NUMERICAL'], ...]

    Recognized directives are GUESS(*), SUBSAMPLE(<clause>),
    DEPENDENT(...), INDEPENDENT(...), and (column, stattype) pairs.
    `subsample_default` is returned as the subsample setting when no
    SUBSAMPLE directive appears.  Malformed directives raise BQLError.

    See test_crosscat_generator_schema.py for examples.
    '''
    # Truth value attached to each dependence directive's constraint.
    dependence_flag = {'dependent': True, 'independent': False}

    found_guess = False
    chosen_subsample = subsample_default
    column_models = []
    constraints = []

    # Empty directives are skipped so that extra commas are allowed:
    #
    #    CREATE GENERATOR t_cc FOR t USING crosscat(
    #        x,
    #        y,
    #        z,
    #    )
    for entry in (d for d in schema if d != []):
        if not isinstance(entry, list) or len(entry) != 2 \
                or not isinstance(entry[0], basestring):
            message = 'Invalid crosscat column model directive: %r' % (
                entry,)
            raise BQLError(None, message)

        keyword = casefold(entry[0])
        argument = entry[1]

        if keyword == 'guess' and argument == ['*']:
            found_guess = True
        elif (keyword == 'subsample' and isinstance(argument, list)
                and len(argument) == 1):
            chosen_subsample = _parse_subsample_clause(argument[0])
        elif keyword in dependence_flag:
            parsed = _parse_dependent_clause(argument)
            constraints.append((parsed, dependence_flag[keyword]))
        elif keyword != 'guess' and casefold(argument) != 'guess':
            column_models.append((entry[0], argument))
        else:
            # entry is a list at this point, so it is formatted whole.
            message = 'Invalid crosscat column model: %r' % (entry,)
            raise BQLError(None, message)

    return GeneratorSchema(
        guess=found_guess,
        subsample=chosen_subsample,
        columns=column_models,
        dep_constraints=constraints,
    )
def bayesdb_add_variable(bdb, population_id, name, stattype):
    """Insert a variable record for `name` into the population.

    The column number is taken from the population's base table, and
    the name is stored with the case it has in that table rather than
    the case the caller supplied.
    """
    base_table = bayesdb_population_table(bdb, population_id)
    colno = bayesdb_table_column_number(bdb, base_table, name)

    # Recover the table's own spelling of the column name.
    stored_name = bayesdb_table_column_name(bdb, base_table, colno)
    assert casefold(name) == casefold(stored_name)

    insert_variable_sql = '''
        INSERT INTO bayesdb_variable
            (population_id, name, colno, stattype)
            VALUES (?, ?, ?, ?)
    '''
    parameters = (population_id, stored_name, colno, stattype)
    bdb.sql_execute(insert_variable_sql, parameters)
def tokenize(tokenses):
    """Generate (token type, token) pairs for the parser.

    Each element of `tokenses` is flattened, the flattened groups are
    separated by ',' tokens, and every token is classified as a
    keyword, punctuation, name, or number.  A final (0, '') pair marks
    end of input.  Raises IOError for a token that is neither a str
    nor an int/float.
    """
    groups = [flatten(tokens) for tokens in tokenses]
    for tok in intersperse(',', groups):
        if isinstance(tok, str):
            folded = casefold(tok)
            if folded in KEYWORDS:
                kind = KEYWORDS[folded]
            elif tok in PUNCTUATION:
                kind = PUNCTUATION[tok]
            else:
                # XXX check for alphanumeric/_
                kind = grammar.L_NAME
            yield kind, tok
            continue
        if isinstance(tok, (int, float)):
            yield grammar.L_NUMBER, tok
            continue
        raise IOError('Invalid token: %r' % (tok,))
    # EOF
    yield 0, ''
def _parse_subsample_clause(clause):
    """Interpret the argument of a SUBSAMPLE directive.

    Returns False for the string 'off' (in any case), the clause
    itself when it is an int, and raises BQLError for anything else.
    """
    if isinstance(clause, basestring) and casefold(clause) == 'off':
        return False
    if isinstance(clause, int):
        return clause
    raise BQLError(None, 'Invalid subsampling: %r' % (clause,))
def _parse_subsample_clause(clause):
    """Decode a SUBSAMPLE clause into False or an int.

    'off' (case-insensitively) disables subsampling and yields False;
    an int is passed through unchanged.  Any other clause raises
    BQLError.
    """
    disabled = isinstance(clause, basestring) and casefold(clause) == 'off'
    if not disabled and not isinstance(clause, int):
        raise BQLError(None, 'Invalid subsampling: %r' % (clause,))
    return False if disabled else clause
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    """Create a crosscat generator named `generator` for `table`.

    `column_stattypes` maps column names to statistical types; the
    lookups below use case-folded table column names as keys.  Raises
    IOError if it names a column that is not in the table.
    """
    column_names = core.bayesdb_table_column_names(bdb, table)
    quoted_columns = [sqlite3_quote_name(cn) for cn in column_names]

    assert all(column_stattypes[key] in allowed_column_stattypes
        for key in column_stattypes)

    known = set(casefold(cn) for cn in column_names)
    for key in column_stattypes:
        if key in known:
            continue
        raise IOError('No such column in table %s: %s' %
            (repr(table), repr(key)))

    # One "<quoted column> <stattype>" entry per table column.
    entries = []
    for cn, quoted in zip(column_names, quoted_columns):
        entries.append('%s %s' % (quoted, column_stattypes[casefold(cn)]))
    schema = ','.join(entries)

    bql = 'CREATE GENERATOR %s FOR %s USING %s(%s)' % (
        sqlite3_quote_name(generator),
        sqlite3_quote_name(table),
        'crosscat',
        schema,
    )
    bdb.execute(bql)
def _cmd_interactive_pairplot(self, query, sql=None, **kwargs):
    """Run `query` and hand the result to jsviz.interactive_pairplot.

    `query` is executed as SQL when `sql` is truthy and as BQL
    otherwise.  The keyword argument `population` is required and
    names the population whose schema describes the columns.  Result
    columns that do not appear in that schema are reported and
    dropped; schema entries that do match are renamed to the result's
    spelling of the column.

    Raises ValueError when `population` is missing.
    """
    population = kwargs.get('population', None)
    if population is None:
        raise ValueError('Specify --population=<name> argument.')

    if sql:
        cursor = self._bdb.sql_execute(query)
    else:
        cursor = self._bdb.execute(query)
    df = utils_bql.cursor_to_df(cursor)
    schema = utils_mml.get_schema_as_list(self._bdb, population)

    for colname in df.columns:
        modelled = False
        for entry in schema:
            if casefold(entry['name']) == casefold(colname):
                modelled = True
                # Make the schema agree with the data frame's case.
                entry['name'] = colname
        if not modelled:
            # Single parenthesised argument: prints the same text under
            # both the print statement and the print function.
            print("Ignoring non-modelled column %s" % (colname, ))
            del df[colname]

    return jsviz.interactive_pairplot(df, schema)
def default_dist(var, stattype):
    """Return the default (dist, params) for `var`, or None.

    The case-folded `stattype` selects a constructor in _DEFAULT_DIST.
    When there is none, the stattype is recorded in `unknown_stattype`
    under `var` (or checked against the one already recorded) and None
    is returned.

    NOTE(review): `bdb`, `generator_id` and `unknown_stattype` are not
    parameters; presumably they come from an enclosing scope -- confirm.
    """
    stattype = casefold(stattype)
    if stattype in _DEFAULT_DIST:
        make_dist = _DEFAULT_DIST[stattype]
        dist, params = make_dist(bdb, generator_id, var)
        return dist, params

    if var not in unknown_stattype:
        unknown_stattype[var] = stattype
    else:
        assert unknown_stattype[var] == stattype
    return None
def scan_nampar(scanner, text):
    """Emit an L_NAMPAR token for the named parameter `text`.

    The case-folded name is mapped to a parameter number through
    `scanner.nampar_map`; a name not seen before is assigned the next
    number from `scanner.n_numpar`.
    """
    key = casefold(text)
    if key not in scanner.nampar_map:
        # Numbered parameters are 1-indexed.
        scanner.n_numpar += 1
        scanner.nampar_map[key] = scanner.n_numpar
    number = scanner.nampar_map[key]
    scanner.produce(grammar.L_NAMPAR, (number, key))
def register_foreign_predictor(self, builder):
    """Register an object which builds a foreign predictor.

    There is no need to construct foreign predictor instances by hand:
    when the BQL query INITIALIZE is called, the `composer` creates,
    trains, and serializes every foreign predictor declared in the
    `schema`.

    Registration is not persistent; foreign predictors must be
    registered again each time the database is launched.

    Parameters
    ----------
    builder : :class:`.IBayesDBForeignPredictorFactory`
        Existing predictors follow the pattern of providing the factory
        methods as `@classmethods` on the class that implements
        :class:`~.IBayesDBForeignPredictor`.  For example,
        :class:`bdbcontrib.predictors.random_forest.RandomForest` is
        registered by passing the **class** itself::

            >> from bdbcontrib.predictors.random_forest import RandomForest
            >> composer.register_foreign_predictor(RandomForest)
    """
    # Validate the builder by duck typing.  An isinstance check against
    # predictor.IBayesDBForeignPredictorFactory would not work, because
    # classes that follow the classmethod pattern are not instances of
    # that interface.
    for required in ('create', 'serialize', 'deserialize', 'name'):
        assert hasattr(builder, required)

    # Names are compared case-insensitively; reject duplicates.
    name = builder.name()
    key = casefold(name)
    if key in self.predictor_builder:
        raise BLE(ValueError(
            'A foreign predictor with name "{}" has already '
            'been registered. Currently registered: {}'.format(
                name, self.predictor_builder)))
    self.predictor_builder[key] = builder
def subsample_table_columns(bdb, table, new_table, limit, keep, drop, seed):
    """Create `new_table` from a random subset of the columns of `table`.

    The new table has the `keep` columns plus columns chosen at random
    (without replacement, using `seed`) from those in neither `keep`
    nor `drop`, for at most `limit` columns in total.  Returns
    cursor_to_df applied to the CREATE TABLE cursor.

    Raises ValueError if `table` is missing, `new_table` exists, a
    named column is unknown, a column is both kept and dropped, or
    `limit` is smaller than the number of columns to keep.
    """
    if not bayesdb_has_table(bdb, table):
        raise ValueError('No such table: %s' % (table, ))
    if bayesdb_has_table(bdb, new_table):
        raise ValueError('Table already exists: %s' % (new_table, ))

    keep = [casefold(c) for c in keep]
    drop = [casefold(c) for c in drop]
    excluded = keep + drop

    missing = []
    for c in excluded:
        if not bayesdb_table_has_column(bdb, table, c):
            missing.append(c)
    if missing:
        raise ValueError('No such columns: %s' % (missing, ))

    conflicting = [c for c in drop if c in keep]
    if conflicting:
        raise ValueError(
            'Cannot both drop and keep columns: %s' % (conflicting, ))

    n_random = limit - len(keep)
    if n_random < 0:
        raise ValueError('Must sample at least as many columns to keep.')

    # Columns eligible for random selection.
    candidates = []
    for c in bayesdb_table_column_names(bdb, table):
        if casefold(c) not in excluded:
            candidates.append(c)

    rng = np.random.RandomState(seed)
    n_chosen = min(len(candidates), n_random)
    chosen = rng.choice(candidates, replace=False, size=n_chosen)

    quoted_columns = [
        bql_quote_name(c) for c in itertools.chain(keep, chosen)
    ]
    create_sql = '''
        CREATE TABLE %s AS SELECT %s FROM %s
    ''' % (
        bql_quote_name(new_table),
        ','.join(quoted_columns),
        bql_quote_name(table),
    )
    return cursor_to_df(bdb.execute(create_sql))
def bayesdb_has_stattype(bdb, stattype):
    """True if bayesdb_stattype has a row named casefold(`stattype`)."""
    bindings = {'stattype': casefold(stattype)}
    cursor = bdb.sql_execute(
        'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype',
        bindings)
    count = cursor_value(cursor)
    return count > 0
def bayesdb_guess_stattypes(column_names, rows, count_cutoff=None,
        ratio_cutoff=None, overrides=None):
    """Heuristically guess statistical types for the data in `rows`.

    Return a list of statistical types, one per entry of the list
    `column_names` and in the same order.

    :param int count_cutoff: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway (default 20)
    :param real ratio_cutoff: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway (default 0.02)
    :param list overrides: list of ``(name, stattype)``, overriding
        any guessed statistical type for columns by those names

    In addition to statistical types, the overrides may specify
    ``key`` or ``ignore``.

    Raises ValueError for duplicate column names, unknown or duplicate
    overrides, rows of the wrong width, more than one ``key`` override,
    or a ``key`` override on a column whose values are not unique.
    """
    # Fill in default arguments.
    count_cutoff = 20 if count_cutoff is None else count_cutoff
    ratio_cutoff = 0.02 if ratio_cutoff is None else ratio_cutoff
    overrides = [] if overrides is None else overrides

    # Collect the case-folded column names, rejecting repeats.
    folded_names = set()
    repeated_names = set()
    for name in column_names:
        folded = casefold(name)
        if folded in folded_names:
            repeated_names.add(name)
        folded_names.add(folded)
    if repeated_names:
        raise ValueError('Duplicate column names: %s' %
            (repr(list(repeated_names)),))

    # Build a map for the overrides.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown = set()
    repeated_overrides = set()
    for name, stattype in overrides:
        folded = casefold(name)
        if folded not in folded_names:
            unknown.add(name)
        elif folded in override_map:
            repeated_overrides.add(name)
        else:
            override_map[folded] = casefold(stattype)
    if unknown:
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown)),))
    if repeated_overrides:
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(repeated_overrides)),))

    # Sanity-check the inputs: every row must have exactly ncols cells.
    ncols = len(column_names)
    assert ncols == len(unique(map(casefold, column_names)))
    for ri, row in enumerate(rows):
        width = len(row)
        if width < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, width, ncols))
        if width > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, width, ncols))

    # Find a key first, if it has been specified as an override.
    key = None
    duplicate_keys = set()
    for ci, column_name in enumerate(column_names):
        if override_map.get(casefold(column_name)) != 'key':
            continue
        if key is not None:
            duplicate_keys.add(column_name)
            continue
        # Prefer the integer reading; fall back to the raw values.
        column = integerify(rows, ci) or [row[ci] for row in rows]
        if not keyable_p(column):
            raise ValueError('Column non-unique but specified as key'
                ': %s' % (repr(column_name),))
        key = column_name
    if duplicate_keys:
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(duplicate_keys)),))

    # Now go through and guess the other column stattypes or use the
    # override.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        folded = casefold(column_name)
        if folded in override_map:
            stattypes.append(override_map[folded])
            continue

        # Try integers, then floats, then give up on numbers.
        column = integerify(rows, ci) or floatify(rows, ci)
        numericable = bool(column)
        if not numericable:
            column = [row[ci] for row in rows]

        if key is None and keyable_p(column):
            # The first keyable column becomes the key.
            key = column_name
            stattypes.append('key')
        elif numericable and \
                numerical_p(column, count_cutoff, ratio_cutoff):
            stattypes.append('numerical')
        else:
            stattypes.append('categorical')
    return stattypes
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `phrase` may be wrapped in ast.Parametrized, in which case its
    parameter count and named-parameter map are used together with
    `bindings` when compiling.  Queries are compiled to SQL and
    executed; every other phrase type (transactions, CREATE/DROP/ALTER
    TABLE, CREATE/DROP/ALTER GENERATOR, INITIALIZE/ANALYZE/DROP MODELS)
    is carried out directly and yields an empty cursor, except DROP
    TABLE, which returns the cursor of the underlying SQL statement.

    Raises BQLError for user errors such as unknown or already-defined
    names, and NotImplementedError for column renames and background
    (non-WAIT) analysis.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" %
                    (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" %
                    (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb,
                    phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" %
                    (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            # Collect the SQL type of every column of the base table,
            # keyed by case-folded name, for the new table's schema.
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)),
                    )
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            #
            # Evaluate nsamples, modelno, and the constraint values
            # with a single SELECT of their compiled expressions.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                    out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id,
                    name), value)
                for (name, _expression), value in
                    zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [
                core.bayesdb_generator_column_number(bdb, generator_id, name)
                for name in column_names
            ]
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)" % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """
                INSERT INTO %s (%s) VALUES (%s)
            """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos,
                modelno=modelno, numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" %
                    (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?",
                (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined as table"
                                ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined"
                                " as generator: %s" % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError("Renaming columns"
                        " not yet implemented.")
                    # NOTE: everything below in this branch is
                    # unreachable until the raise above is removed.
                    #
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb,
                                "No such column in table %s"
                                ": %s" % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb,
                                "Column already exists"
                                " in table %s: %s"
                                % (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql,
                        {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb,
                                generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" %
                            (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb,
                        cmd.generator)
                    # Clear any existing default for the table, then
                    # mark the requested generator as the default.
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" %
                (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            def instantiate(columns):
                return instantiate_generator(
                    bdb,
                    phrase.name,
                    phrase.table,
                    metamodel,
                    columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )
            metamodel.create_generator(bdb, phrase.table, phrase.schema,
                instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """
                DELETE FROM bayesdb_generator WHERE id = ?
            """
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined as table"
                                ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined"
                                " as generator: %s" % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = """
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb,
                        generator_id, modelno)
                )
            else:
                existing = set(
                    modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb,
                        "Generator %s already has models: %s"
                        % (repr(phrase.generator), sorted(existing))
                    )

            # Stop now if there's nothing to initialize.  Return an
            # empty cursor rather than None so that callers always get
            # a cursor, as in every other branch.
            if len(modelnos) == 0:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql,
                    {"generator_id": generator_id, "modelno": modelno,
                        "iterations": 0})

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb,
            generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # modelnos of None means "drop all models".
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = """
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql,
                        {"generator_id": generator_id, "modelno": modelno})
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb,
                            "No such model"
                            " in generator %s: %s"
                            % (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = """
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql,
                        {"generator_id": generator_id, "modelno": modelno})
        return empty_cursor(bdb)

    assert False                # XXX
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    The phrase is dispatched on its AST node type:

    - queries are compiled to SQL and run through `execute_wound`;
    - BEGIN / ROLLBACK / COMMIT are forwarded to the `txn` module;
    - CREATE TABLE AS, CREATE TABLE FROM SIMULATE, DROP TABLE and
      ALTER TABLE operate on tables;
    - CREATE / DROP / ALTER GENERATOR operate on generators;
    - INITIALIZE / ANALYZE / DROP MODELS operate on a generator's models.

    `bindings` holds the parameter values handed to the compiler.  If
    `phrase` is an `ast.Parametrized`, it is unwrapped and its parameter
    counts and names are used; otherwise the phrase is treated as having
    no parameters and extraneous bindings are not rejected.

    Every command branch returns `empty_cursor(bdb)` except DROP TABLE,
    which returns the cursor of the underlying `DROP TABLE` statement.

    Raises BQLError for names that are missing or already defined, and
    NotImplementedError for column renames and for ANALYZE without WAIT.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb,
                    'Name already defined as generator: %s'
                    % (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb,
                    'Name already defined as table: %s'
                    % (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb,
                    'No such generator: %s'
                    % (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            # NOTE(review): the metamodel is looked up but not used in
            # this branch; kept in case the lookup doubles as a
            # validity check -- confirm.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = phrase.simulation.columns
            # A real list, because it is iterated three times below; a
            # lazy `map` object would be exhausted after the first pass.
            qcns = [sqlite3_quote_name(name) for name in column_names]

            # Collect the SQL types of the base table's columns so the
            # simulated table can reuse them.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)

            # Both the simulated columns and the constrained columns
            # must exist in the base table.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        'No such column'
                        ' in generator %s table %s: %s'
                        % (repr(phrase.simulation.generator),
                           repr(table),
                           repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        'No such column'
                        ' in generator %s table %s: %s'
                        % (repr(phrase.simulation.generator),
                           repr(table),
                           repr(column_name)))

            # Evaluate the sample count, the model number, and the
            # constraint values with a single SELECT.
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(
                    bdb, phrase.simulation.nsamples, out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(
                    bdb, phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(
                    out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(
                    bdb, generator_id, name),
                 value)
                for (name, _expression), value
                in zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [
                core.bayesdb_generator_column_number(bdb, generator_id, name)
                for name in column_names
            ]

            # Create the destination table and fill it with the
            # simulated rows.
            column_defs = ','.join(
                '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                for qcn, column_name in zip(qcns, column_names))
            bdb.sql_execute('CREATE %sTABLE %s%s (%s)' % (
                'TEMP ' if phrase.temp else '',
                'IF NOT EXISTS ' if phrase.ifnotexists else '',
                qn,
                column_defs))
            insert_sql = ' INSERT INTO %s (%s) VALUES (%s) ' % (
                qn, ','.join(qcns), ','.join('?' for _qcn in qcns))
            for row in bqlfn.bayesdb_simulate(
                    bdb, generator_id, constraints, colnos,
                    modelno=modelno, numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb,
                    'Table still in use by generators: %s'
                    % (repr(phrase.name),))
            bdb.sql_execute(
                'DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb,
                                'Name already defined as table'
                                ': %s' % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb,
                                'Name already defined'
                                ' as generator: %s' % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # NOTE: everything below in this branch is
                    # unreachable until the raise above is removed; it
                    # is kept as the staged implementation.
                    # Make sure the old name exist and the new name
                    # does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb,
                                'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(
                                bdb, table, cmd.new):
                            raise BQLError(
                                bdb,
                                'Column already exists'
                                ' in table %s: %s'
                                % (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = (
                        ' UPDATE bayesdb_column SET name = :new'
                        ' WHERE tabname = :table AND name = :old '
                    )
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = (
                            ' SELECT id FROM bayesdb_generator'
                            ' WHERE tabname = ? '
                        )
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(
                                bdb, generator_id, old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb,
                            'No such generator: %s'
                            % (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    # Clear the current default (at most one row is
                    # expected to change), then mark the new one.
                    unset_default_sql = (
                        ' UPDATE bayesdb_generator SET defaultp = 0'
                        ' WHERE tabname = ? AND defaultp '
                    )
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes \
                        in (0, 1)
                    set_default_sql = (
                        ' UPDATE bayesdb_generator SET defaultp = 1'
                        ' WHERE id = ? '
                    )
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = (
                        ' UPDATE bayesdb_generator SET defaultp = 0'
                        ' WHERE tabname = ? AND defaultp '
                    )
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes \
                        in (0, 1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb,
                'No such metamodel: %s' % (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb,
                        'Name already defined as generator: %s'
                        % (repr(phrase.name),))
            else:
                def instantiate(columns):
                    return instantiate_generator(
                        bdb, phrase.name, phrase.table, metamodel, columns,
                        default=phrase.default)
                metamodel.create_generator(
                    bdb, phrase.table, phrase.schema, instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(
                    bdb, 'No such generator: %s' % (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = (
                ' DELETE FROM bayesdb_generator_column'
                ' WHERE generator_id = ? '
            )
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = (
                ' DELETE FROM bayesdb_generator_model'
                ' WHERE generator_id = ? '
            )
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = ' DELETE FROM bayesdb_generator WHERE id = ? '
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(
                    bdb, 'No such generator: %s' % (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb,
                                'Name already defined as table'
                                ': %s' % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb,
                                'Name already defined'
                                ' as generator: %s' % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = (
                        ' UPDATE bayesdb_generator SET name = ?'
                        ' WHERE id = ? '
                    )
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(
                        update_generator_sql, (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(
                bdb, 'No such generator: %s' % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno))
            else:
                existing = set(
                    modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb,
                        'Generator %s already has models: %s'
                        % (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.  Return an
            # empty cursor rather than None so that callers always get
            # a cursor back, as for every other command.
            if len(modelnos) == 0:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = (
                ' INSERT INTO bayesdb_generator_model'
                ' (generator_id, modelno, iterations)'
                ' VALUES (:generator_id, :modelno, :iterations) '
            )
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': 0,
                })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(
                bdb, generator_id, modelnos, model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(
                bdb, 'No such generator: %s' % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # modelnos stays None when no model numbers were named,
            # which selects the "delete every model" path below.
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = (
                    ' SELECT COUNT(*) FROM bayesdb_generator_model'
                    ' WHERE generator_id = :generator_id'
                    ' AND modelno = :modelno '
                )
                modelnos = sorted(phrase.modelnos)
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb,
                            'No such model'
                            ' in generator %s: %s'
                            % (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = (
                    ' DELETE FROM bayesdb_generator_model'
                    ' WHERE generator_id = ? '
                )
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = (
                    ' DELETE FROM bayesdb_generator_model'
                    ' WHERE generator_id = :generator_id'
                    ' AND modelno = :modelno '
                )
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    # No branch above recognized the phrase type.
    assert False                # XXX
def p_foreign_name(self, foreign):
    """Return `foreign` passed through `casefold`."""
    folded = casefold(foreign)
    return folded


def p_given_opt_none(self):
    """Return a new empty list.

    NOTE(review): presumably the value for an omitted optional GIVEN
    clause -- confirm against the grammar.
    """
    nothing_given = []
    return nothing_given
def _is_continuous(stattype):
    """Return True if the casefolded `stattype` is one of the types this
    module treats as continuous: cyclic, numerical, or boolean."""
    continuous_stattypes = ('cyclic', 'numerical', 'boolean')
    folded = casefold(stattype)
    return folded in continuous_stattypes
def create_generator(self, bdb, table, schema, instantiate):
    """Create this generator plus its internal crosscat generator.

    `schema` is handed to `self.parse`.  `instantiate` is called with
    the parsed column items and returns the generator id together with
    the generator's column records.  A crosscat generator named after
    this generator with a `_cc` suffix is then created over `table` for
    the local columns and dependency constraints, and the composer
    bookkeeping tables are filled in inside one savepoint.
    """
    parsed = self.parse(schema)
    (columns, lcols, _fcols, fcol_to_pcols, fcol_to_fpred,
        dependencies) = parsed

    # Instantiate **this** generator.
    genid, bdbcolumns = instantiate(columns.items())

    # The internal crosscat generator shares this generator's name,
    # with a _cc suffix.
    cc_suffix = '_cc'
    cc_name = bayeslite.core.bayesdb_generator_name(bdb, genid) + cc_suffix

    # Build the pieces of the crosscat schema.
    local_columns = ','.join(
        '{} {}'.format(quote(c), quote(columns[c])) for c in lcols)
    dep_clauses = []
    for dependent, colnames in dependencies:
        keyword = 'DEPENDENT' if dependent else 'INDEPENDENT'
        quoted_names = ','.join(map(quote, colnames))
        dep_clauses.append(keyword + '({})'.format(quoted_names))
    create_cc_bql = ' CREATE GENERATOR {} FOR {} USING crosscat( {}, {} ); '
    bdb.execute(create_cc_bql.format(
        quote(cc_name), quote(table), local_columns, ','.join(dep_clauses)))

    # Convert column names to column numbers.
    def colno_of(name):
        return core.bayesdb_generator_column_number(bdb, genid, name)

    fcolno_to_pcolnos = {}
    for fcol in fcol_to_pcols:
        fcolno = colno_of(fcol)
        fcolno_to_pcolnos[fcolno] = [
            colno_of(pcol) for pcol in fcol_to_pcols[fcol]]

    # SQL text for the composer bookkeeping tables.
    insert_cc_id_sql = (
        ' INSERT INTO bayesdb_composer_cc_id'
        ' (generator_id, crosscat_generator_id)'
        ' VALUES (?,?) '
    )
    insert_owner_sql = (
        ' INSERT INTO bayesdb_composer_column_owner'
        ' (generator_id, colno, local)'
        ' VALUES (?,?,?) '
    )
    insert_parents_sql = (
        ' INSERT INTO bayesdb_composer_column_parents'
        ' (generator_id, fcolno, pcolno)'
        ' VALUES (?,?,?) '
    )
    insert_toposort_sql = (
        ' INSERT INTO bayesdb_composer_column_toposort'
        ' (generator_id, colno, position)'
        ' VALUES (?,?,?) '
    )
    insert_predictor_sql = (
        ' INSERT INTO bayesdb_composer_column_foreign_predictor'
        ' (generator_id, colno, predictor_name)'
        ' VALUES (?,?,?) '
    )

    with bdb.savepoint():
        # Save internal cc generator id.
        cc_id = core.bayesdb_get_generator(bdb, cc_name)
        bdb.sql_execute(insert_cc_id_sql, (genid, cc_id))

        # Save lcols/fcolnos: a column is local unless it is one of
        # the foreign columns.
        for colno, _, _ in bdbcolumns:
            is_local = int(colno not in fcolno_to_pcolnos)
            bdb.sql_execute(insert_owner_sql, (genid, colno, is_local))

        # Save parents of foreign columns.
        for fcolno in fcolno_to_pcolnos:
            for pcolno in fcolno_to_pcolnos[fcolno]:
                bdb.sql_execute(insert_parents_sql, (genid, fcolno, pcolno))

        # Save topological order.
        ordering = self.topological_sort(fcolno_to_pcolnos)
        for position, (colno, _) in enumerate(ordering):
            bdb.sql_execute(insert_toposort_sql, (genid, colno, position))

        # Save predictor names of foreign columns.
        for fcolno in fcolno_to_pcolnos:
            fcol_name = core.bayesdb_generator_column_name(
                bdb, genid, fcolno)
            fp_name = fcol_to_fpred[casefold(fcol_name)]
            bdb.sql_execute(
                insert_predictor_sql, (genid, fcolno, casefold(fp_name)))
def _create_population(bdb, phrase):
    """Create the population described by the CREATE POPULATION `phrase`.

    The population is named `phrase.name`, or `phrase.table` (and
    recorded as implicit) when no name is given.  Every column of the
    base table must be covered by a model, ignore, or guess clause in
    `phrase.schema`.  Guessed columns get their statistical type from
    `bayesdb_guess_stattypes`; columns guessed to be keys are ignored.
    Variables whose statistical type is `ignore` are not recorded.

    Returns None.  Does nothing if the population already exists and
    `phrase.ifnotexists` is set.

    Raises BQLError if the name is already taken, if a wildcard guess
    is mixed with named guess variables, if a column with string values
    is modeled as numerical, if a base-table column has no modeling
    policy, or if the schema names missing columns, duplicate columns,
    or invalid statistical types.

    NOTE: the population record is inserted before the schema is
    validated, so a validation failure leaves it behind unless the
    caller rolls back.
    """
    # Retrieve the (possibly implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        raise BQLError(
            bdb,
            'Name already defined as population: %r' % (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table.  The user is required to
    # provide a strategy for each single variable, either MODEL, IGNORE,
    # or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    insert_population_sql = (
        ' INSERT INTO bayesdb_population (name, tabname, implicit)'
        ' VALUES (?, ?, ?) '
    )
    bdb.sql_execute(
        insert_population_sql, (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
         for s in phrase.schema if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
         for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb,
                'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess,))
        # The wildcard stands for every base-table column that has no
        # explicit MODEL or IGNORE policy.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        guessed = [
            (col, st[0]) for col, st in zip(pop_guess, pop_guess_stattypes)
        ]
        # Built as lists (rather than mutating the result of zip) so
        # this does not depend on zip returning a list.
        pop_guess_vars = [(col, st) for col, st in guessed if st != 'key']
        pop_ignore_vars.extend(
            (col, 'ignore') for col, st in guessed if st == 'key')
    else:
        pop_guess_vars = []

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
        and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(
            bdb,
            'Column(s) with string values modeled as numerical: %r'
            % (numerical_string_vars,))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb,
            'Cannot determine a modeling policy for variables: %r'
            % (not_found,))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = (
        ' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '
    )
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        # Record the folded name: the membership test above uses the
        # folded form, so storing the raw name would let duplicates of
        # any name containing uppercase slip through.
        seen_variables.add(name)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r'
            % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(
            bdb, 'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(
            bdb, 'Invalid statistical types: %r' % (list(invalid),))

    # Insert variable records, skipping ignored variables.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
def execute_phrase(bdb, phrase, bindings=()): """Execute the BQL AST phrase `phrase` and return a cursor of results.""" if isinstance(phrase, ast.Parametrized): n_numpar = phrase.n_numpar nampar_map = phrase.nampar_map phrase = phrase.phrase assert 0 < n_numpar else: n_numpar = 0 nampar_map = None # Ignore extraneous bindings. XXX Bad idea? if ast.is_query(phrase): # Compile the query in the transaction in case we need to # execute subqueries to determine column lists. Compiling is # a quick tree descent, so this should be fast. out = compiler.Output(n_numpar, nampar_map, bindings) with bdb.savepoint(): compiler.compile_query(bdb, phrase, out) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.Begin): txn.bayesdb_begin_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Rollback): txn.bayesdb_rollback_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Commit): txn.bayesdb_commit_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabAs): assert ast.is_query(phrase.query) with bdb.savepoint(): if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError(bdb, 'Name already defined as table: %s' % (repr(phrase.name),)) out = compiler.Output(n_numpar, nampar_map, bindings) qt = sqlite3_quote_name(phrase.name) temp = 'TEMP ' if phrase.temp else '' ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else '' out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt)) compiler.compile_query(bdb, phrase.query, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): bdb.sql_execute(out.getvalue(), out.getbindings()) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabCsv): with bdb.savepoint(): table_exists = core.bayesdb_has_table(bdb, phrase.name) if table_exists: if phrase.ifnotexists: return empty_cursor(bdb) else: 
raise BQLError(bdb, 'Table already exists: %s' % (repr(phrase.name),)) bayesdb_read_csv_file( bdb, phrase.name, phrase.csv, header=True, create=True) return empty_cursor(bdb) if isinstance(phrase, ast.DropTab): with bdb.savepoint(): sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?' cursor = bdb.sql_execute(sql, (phrase.name,)) if 0 < cursor_value(cursor): raise BQLError(bdb, 'Table still in use by populations: %s' % (repr(phrase.name),)) bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?', (phrase.name,)) ifexists = 'IF EXISTS ' if phrase.ifexists else '' qt = sqlite3_quote_name(phrase.name) return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt)) if isinstance(phrase, ast.AlterTab): with bdb.savepoint(): table = phrase.table if not core.bayesdb_has_table(bdb, table): raise BQLError(bdb, 'No such table: %s' % (repr(table),)) for cmd in phrase.commands: if isinstance(cmd, ast.AlterTabRenameTab): # If the names differ only in case, we have to do # some extra work because SQLite will reject the # table rename. Note that we may even have table # == cmd.name here, but if the stored table name # differs in case from cmd.name, we want to update # it anyway. if casefold(table) == casefold(cmd.name): # Go via a temporary table. temp = table + '_temp' while core.bayesdb_has_table(bdb, temp): temp += '_temp' rename_table(bdb, table, temp) rename_table(bdb, temp, cmd.name) else: # Make sure nothing else has this name and # rename it. if core.bayesdb_has_table(bdb, cmd.name): raise BQLError(bdb, 'Name already defined as table: %s' % (repr(cmd.name),)) rename_table(bdb, table, cmd.name) # If table has implicit population, rename it too. 
if core.bayesdb_table_has_implicit_population( bdb, cmd.name): populations = \ core.bayesdb_table_populations(bdb, cmd.name) assert len(populations) == 1 population_name = core.bayesdb_population_name( bdb, populations[0]) qt = sqlite3_quote_name(cmd.name) qp = sqlite3_quote_name(population_name) bdb.execute('ALTER POPULATION %s RENAME TO %s' % (qp, qt)) # Remember the new name for subsequent commands. table = cmd.name elif isinstance(cmd, ast.AlterTabRenameCol): # XXX Need to deal with this in the compiler. raise NotImplementedError('Renaming columns' ' not yet implemented.') # Make sure the old name exist and the new name does not. old_folded = casefold(cmd.old) new_folded = casefold(cmd.new) if old_folded != new_folded: if not core.bayesdb_table_has_column(bdb, table, cmd.old): raise BQLError(bdb, 'No such column in table %s' ': %s' % (repr(table), repr(cmd.old))) if core.bayesdb_table_has_column(bdb, table, cmd.new): raise BQLError(bdb, 'Column already exists' ' in table %s: %s' % (repr(table), repr(cmd.new))) # Update bayesdb_column. Everything else refers # to columns by (tabname, colno) pairs rather than # by names. update_column_sql = ''' UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_column_sql, { 'table': table, 'old': cmd.old, 'new': cmd.new, }) assert bdb._sqlite3.totalchanges() - total_changes == 1 # ...except backends may have the (case-folded) name cached. if old_folded != new_folded: populations_sql = ''' SELECT id FROM bayesdb_population WHERE tabname = ? 
''' cursor = bdb.sql_execute(populations_sql, (table,)) generators = [ core.bayesdb_population_generators( bdb, population_id) for (population_id,) in cursor ] for generator_id in set(generators): backend = core.bayesdb_generator_backend(bdb, generator_id) backend.rename_column(bdb, generator_id, old_folded, new_folded) else: assert False, 'Invalid alter table command: %s' % \ (cmd,) return empty_cursor(bdb) if isinstance(phrase, ast.GuessSchema): if not core.bayesdb_has_table(bdb, phrase.table): raise BQLError(bdb, 'No such table : %s' % phrase.table) out = compiler.Output(0, {}, {}) with bdb.savepoint(): qt = sqlite3_quote_name(phrase.table) temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,)) column_names = [d[0] for d in cursor.description] rows = cursor.fetchall() stattypes = bayesdb_guess_stattypes(column_names, rows) distinct_value_counts = [ len(set([row[i] for row in rows])) for i in range(len(column_names)) ] out.winder(''' CREATE TEMP TABLE %s ( column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT ) ''' % (qtt,), ()) for cn, st, ct in zip(column_names, stattypes, distinct_value_counts): out.winder(''' INSERT INTO %s VALUES (?, ?, ?, ?) 
''' % (qtt), (cn, st[0], ct, st[1])) out.write('SELECT * FROM %s' % (qtt,)) out.unwinder('DROP TABLE %s' % (qtt,), ()) winders, unwinders = out.getwindings() return execute_wound( bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.CreatePop): with bdb.savepoint(): _create_population(bdb, phrase) return empty_cursor(bdb) if isinstance(phrase, ast.DropPop): with bdb.savepoint(): if not core.bayesdb_has_population(bdb, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such population: %r' % (phrase.name,)) population_id = core.bayesdb_get_population(bdb, phrase.name) generator_ids = core.bayesdb_population_generators( bdb, population_id) if generator_ids: generators = [core.bayesdb_generator_name(bdb, gid) for gid in generator_ids] raise BQLError(bdb, 'Population %r still has generators: %r' % (phrase.name, generators)) # XXX helpful error checking if generators still exist # XXX check change counts bdb.sql_execute(''' DELETE FROM bayesdb_variable WHERE population_id = ? ''', (population_id,)) bdb.sql_execute(''' DELETE FROM bayesdb_population WHERE id = ? ''', (population_id,)) return empty_cursor(bdb) if isinstance(phrase, ast.AlterPop): with bdb.savepoint(): population = phrase.population if not core.bayesdb_has_population(bdb, population): raise BQLError(bdb, 'No such population: %s' % (repr(population),)) population_id = core.bayesdb_get_population(bdb, population) for cmd in phrase.commands: if isinstance(cmd, ast.AlterPopRenamePop): table = core.bayesdb_population_table(bdb, population_id) # Prevent renaming of implicit population directly, unless # being called by ast.AlterTabRenameTab in which case the # table name and population name will not be matching. 
if core.bayesdb_population_is_implicit(bdb, population_id) \ and casefold(population) == casefold(table): raise BQLError(bdb, 'Cannot rename implicit' 'population %s; rename base table instead' % (population,)) # Make sure nothing else has this name. if casefold(population) != casefold(cmd.name): if core.bayesdb_has_population(bdb, cmd.name): raise BQLError(bdb, 'Name already defined as population' ': %s' % (repr(cmd.name),)) # Update bayesdb_population. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_population SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, population_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # If population has implicit generator, rename it too. if core.bayesdb_population_has_implicit_generator( bdb, population_id): generators = core.bayesdb_population_generators( bdb, population_id) assert len(generators) == 1 generator_name = core.bayesdb_generator_name( bdb, generators[0]) qp = sqlite3_quote_name(cmd.name) qg = sqlite3_quote_name(generator_name) bdb.execute('ALTER GENERATOR %s RENAME TO %s' % (qg, qp,)) # Remember the new name for subsequent commands. population = cmd.name elif isinstance(cmd, ast.AlterPopAddVar): # Ensure column exists in base table. table = core.bayesdb_population_table(bdb, population_id) if not core.bayesdb_table_has_column( bdb, table, cmd.name): raise BQLError(bdb, 'No such variable in base table: %s' % (cmd.name)) # Ensure variable not already in population. if core.bayesdb_has_variable( bdb, population_id, None, cmd.name): raise BQLError(bdb, 'Variable already in population: %s' % (cmd.name)) # Ensure there is at least observation in the column. 
qt = sqlite3_quote_name(table) qc = sqlite3_quote_name(cmd.name) cursor = bdb.sql_execute( 'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' % (qt, qc)) if cursor_value(cursor) == 0: raise BQLError(bdb, 'Cannot add variable without any values: %s' % (cmd.name)) # If stattype is None, guess. if cmd.stattype is None: cursor = bdb.sql_execute( 'SELECT %s FROM %s' % (qc, qt)) rows = cursor.fetchall() [stattype, reason] = bayesdb_guess_stattypes( [cmd.name], rows)[0] # Fail if trying to model a key. if stattype == 'key': raise BQLError(bdb, 'Values in column %s appear to be keys.' % (cmd.name,)) # Fail if cannot determine a stattype. elif stattype == 'ignore': raise BQLError(bdb, 'Failed to determine a stattype for %s, ' 'please specify one manually.' % (cmd.name,)) # If user specified stattype, ensure it exists. elif not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError(bdb, 'Invalid stattype: %s' % (cmd.stattype)) else: stattype = cmd.stattype # Check that strings are not being modeled as numerical. if stattype == 'numerical' \ and _column_contains_string(bdb, table, cmd.name): raise BQLError(bdb, 'Numerical column contains string values: %r ' % (qc,)) with bdb.savepoint(): # Add the variable to the population. core.bayesdb_add_variable( bdb, population_id, cmd.name, stattype) colno = core.bayesdb_variable_number( bdb, population_id, None, cmd.name) # Add the variable to each (initialized) generator in # the population. generator_ids = filter( lambda g: core.bayesdb_generator_modelnos(bdb, g), core.bayesdb_population_generators( bdb, population_id), ) for generator_id in generator_ids: backend = core.bayesdb_generator_backend( bdb, generator_id) backend.add_column(bdb, generator_id, colno) elif isinstance(cmd, ast.AlterPopStatType): # Check the no generators are defined for this population. 
generators = core.bayesdb_population_generators( bdb, population_id) if generators: raise BQLError(bdb, 'Cannot update statistical types for population ' '%s, it has generators: %s' % (repr(population), repr(generators),)) # Check all the variables are in the population. unknown = [ c for c in cmd.names if not core.bayesdb_has_variable(bdb, population_id, None, c) ] if unknown: raise BQLError(bdb, 'No such variables in population: %s' % (repr(unknown))) # Check the statistical type is valid. if not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError(bdb, 'Invalid statistical type: %r' % (repr(cmd.stattype),)) # Check that strings are not being modeled as numerical. if cmd.stattype == 'numerical': table = core.bayesdb_population_table( bdb, population_id) numerical_string_vars = [ col for col in cmd.names if _column_contains_string(bdb, table, col) ] if numerical_string_vars: raise BQLError(bdb, 'Columns with string values modeled as ' 'numerical: %r' % (numerical_string_vars,)) # Perform the stattype update. colnos = [ core.bayesdb_variable_number( bdb, population_id, None, c) for c in cmd.names ] qcolnos = ','.join('%d' % (colno,) for colno in colnos) update_stattype_sql = ''' UPDATE bayesdb_variable SET stattype = ? WHERE population_id = ? AND colno IN (%s) ''' % (qcolnos,) bdb.sql_execute( update_stattype_sql, (casefold(cmd.stattype), population_id,)) else: assert False, 'Invalid ALTER POPULATION command: %s' % \ (repr(cmd),) return empty_cursor(bdb) if isinstance(phrase, ast.CreateGen): # Find the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population,)) population_id = core.bayesdb_get_population(bdb, phrase.population) # Find the backend, or use the default. 
backend_name = phrase.backend if phrase.backend is None: backend_name = 'cgpm' if backend_name not in bdb.backends: raise BQLError(bdb, 'No such backend: %s' % (repr(backend_name),)) backend = bdb.backends[backend_name] # Retrieve the (possibility implicit) generator name. generator_name = phrase.name or phrase.population implicit = 1 if phrase.name is None else 0 with bdb.savepoint(): if core.bayesdb_has_generator(bdb, population_id, generator_name): if not phrase.ifnotexists: raise BQLError( bdb, 'Name already defined as generator: %s' % (repr(generator_name),)) else: # Insert a record into bayesdb_generator and get the # assigned id. bdb.sql_execute(''' INSERT INTO bayesdb_generator (name, population_id, backend, implicit) VALUES (?, ?, ?, ?) ''', (generator_name, population_id, backend.name(), implicit)) generator_id = core.bayesdb_get_generator( bdb, population_id, generator_name) # Do any backend-specific initialization. backend.create_generator(bdb, generator_id, phrase.schema) # All done. Nothing to return. return empty_cursor(bdb) if isinstance(phrase, ast.DropGen): with bdb.savepoint(): if not core.bayesdb_has_generator(bdb, None, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such generator: %s' % (repr(phrase.name),)) generator_id = core.bayesdb_get_generator(bdb, None, phrase.name) backend = core.bayesdb_generator_backend(bdb, generator_id) # Backend-specific destruction. backend.drop_generator(bdb, generator_id) # Drop latent variables, models, and, finally, generator. drop_columns_sql = ''' DELETE FROM bayesdb_variable WHERE generator_id = ? ''' bdb.sql_execute(drop_columns_sql, (generator_id,)) drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_model_sql, (generator_id,)) drop_generator_sql = ''' DELETE FROM bayesdb_generator WHERE id = ? 
''' bdb.sql_execute(drop_generator_sql, (generator_id,)) return empty_cursor(bdb) if isinstance(phrase, ast.AlterGen): with bdb.savepoint(): generator = phrase.generator if not core.bayesdb_has_generator(bdb, None, generator): raise BQLError(bdb, 'No such generator: %s' % (repr(generator),)) generator_id = core.bayesdb_get_generator(bdb, None, generator) cmds_generic = [] for cmd in phrase.commands: if isinstance(cmd, ast.AlterGenRenameGen): population_id = core.bayesdb_generator_population( bdb, generator_id) population = core.bayesdb_population_name( bdb, population_id) # Prevent renaming of implicit generator directly, unless # being called by ast.AlterPopRenamePop in which case the # population name and generator name will not be matching. if core.bayesdb_population_is_implicit(bdb, generator_id) \ and casefold(generator) == casefold(population): raise BQLError(bdb, 'Cannot rename implicit ' 'generator; rename base population instead') # Disable modelnos with AlterGenRenameGen. if phrase.modelnos is not None: raise BQLError(bdb, 'Cannot specify models for RENAME') # Make sure nothing else has this name. if casefold(generator) != casefold(cmd.name): if core.bayesdb_has_generator(bdb, None, cmd.name): raise BQLError(bdb, 'Name already defined' ' as generator: %s' % (repr(cmd.name),)) # Update bayesdb_generator. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_generator SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, generator_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # Remember the new name for subsequent commands. 
generator = cmd.name elif isinstance(cmd, ast.AlterGenGeneric): cmds_generic.append(cmd.command) else: assert False, 'Invalid ALTER GENERATOR command: %s' % \ (repr(cmd),) if cmds_generic: modelnos = phrase.modelnos modelnos_invalid = None if modelnos is None else [ modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno) ] if modelnos_invalid: raise BQLError(bdb, 'No such models in generator %s: %s' % (repr(phrase.generator), repr(modelnos))) # Call generic alternations on the backend. backend = core.bayesdb_generator_backend(bdb, generator_id) backend.alter(bdb, generator_id, modelnos, cmds_generic) return empty_cursor(bdb) if isinstance(phrase, ast.InitModels): if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator,)) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) modelnos = range(phrase.nmodels) with bdb.savepoint(): # Find the model numbers. Omit existing ones for # ifnotexists; reject existing ones otherwise. if phrase.ifnotexists: modelnos = set(modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno)) else: existing = set(modelno for modelno in modelnos if core.bayesdb_generator_has_model(bdb, generator_id, modelno)) if 0 < len(existing): raise BQLError(bdb, 'Generator %s already has models: %s' % (repr(phrase.generator), sorted(existing))) # Stop now if there's nothing to initialize. if len(modelnos) == 0: return # Create the bayesdb_generator_model records. modelnos = sorted(modelnos) insert_model_sql = ''' INSERT INTO bayesdb_generator_model (generator_id, modelno) VALUES (:generator_id, :modelno) ''' for modelno in modelnos: bdb.sql_execute(insert_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) # Do backend-specific initialization. 
backend = core.bayesdb_generator_backend(bdb, generator_id) backend.initialize_models(bdb, generator_id, modelnos) return empty_cursor(bdb) if isinstance(phrase, ast.AnalyzeModels): # WARNING: It is the backend's responsibility to work in a # transaction. # # WARNING: It is the backend's responsibility to update the # iteration count in bayesdb_generator_model records. # # We do this so that the backend can save incremental # progress in case of ^C in the middle. # # XXX Put these warning somewhere more appropriate. if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator,)) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) backend = core.bayesdb_generator_backend(bdb, generator_id) # XXX Should allow parameters for iterations and ckpt/iter. backend.analyze_models(bdb, generator_id, modelnos=phrase.modelnos, iterations=phrase.iterations, max_seconds=phrase.seconds, ckpt_iterations=phrase.ckpt_iterations, ckpt_seconds=phrase.ckpt_seconds, program=phrase.program) return empty_cursor(bdb) if isinstance(phrase, ast.DropModels): with bdb.savepoint(): generator_id = core.bayesdb_get_generator( bdb, None, phrase.generator) backend = core.bayesdb_generator_backend(bdb, generator_id) modelnos = None if phrase.modelnos is not None: lookup_model_sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' modelnos = sorted(list(phrase.modelnos)) for modelno in modelnos: cursor = bdb.sql_execute(lookup_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) if cursor_value(cursor) == 0: raise BQLError(bdb, 'No such model' ' in generator %s: %s' % (repr(phrase.generator), repr(modelno))) backend.drop_models(bdb, generator_id, modelnos=modelnos) if modelnos is None: drop_models_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? 
''' bdb.sql_execute(drop_models_sql, (generator_id,)) else: drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' for modelno in modelnos: bdb.sql_execute(drop_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) return empty_cursor(bdb) if isinstance(phrase, ast.Regress): # Retrieve the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population,)) population_id = core.bayesdb_get_population(bdb, phrase.population) # Retrieve the generator generator_id = None if phrase.generator: if not core.bayesdb_has_generator(bdb, population_id, phrase.generator): raise BQLError(bdb, 'No such generator: %r' % (phrase.generator,)) generator_id = core.bayesdb_get_generator( bdb, population_id, phrase.generator) # Retrieve the target variable. if not core.bayesdb_has_variable( bdb, population_id, None, phrase.target): raise BQLError(bdb, 'No such variable: %r' % (phrase.target,)) colno_target = core.bayesdb_variable_number( bdb, population_id, None, phrase.target) stattype = core.bayesdb_variable_stattype(bdb, population_id, generator_id, colno_target) if stattype != 'numerical': raise BQLError(bdb, 'Target variable is not numerical: %r' % (phrase.target,)) # Build the given variables. if any(isinstance(col, ast.SelColAll) for col in phrase.givens): # Using * is not allowed to be mixed with other variables. if len(phrase.givens) > 1: raise BQLError(bdb, 'Cannot use (*) with other givens.') colno_givens = core.bayesdb_variable_numbers( bdb, population_id, None) else: if any(isinstance(col, ast.SelColSub) for col in phrase.givens): # Subexpression needs special compiling. 
out = compiler.Output(n_numpar, nampar_map, bindings) bql_compiler = compiler.BQLCompiler_None() givens = compiler.expand_select_columns( bdb, phrase.givens, True, bql_compiler, out) else: givens = phrase.givens colno_givens = [ core.bayesdb_variable_number( bdb, population_id, None, given.expression.column) for given in givens ] # Build the arguments to bqlfn.bayesdb_simulate. colno_givens_unique = set( colno for colno in colno_givens if colno!= colno_target ) if len(colno_givens_unique) == 0: raise BQLError(bdb, 'No matching given columns.') constraints = [] colnos = [colno_target] + list(colno_givens_unique) nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value modelnos = None if phrase.modelnos is None else str(phrase.modelnos) rows = bqlfn.bayesdb_simulate( bdb, population_id, generator_id, modelnos, constraints, colnos, numpredictions=nsamp) # Retrieve the stattypes. stattypes = [ core.bayesdb_variable_stattype( bdb, population_id, generator_id, colno_given) for colno_given in colno_givens_unique ] # Separate the target values from the given values. target_values = [row[0] for row in rows] given_values = [row[1:] for row in rows] given_names = [ core.bayesdb_variable_name(bdb, population_id, generator_id, given) for given in colno_givens_unique ] # Compute the coefficients. The import to regress_ols is here since the # feature depends on pandas + sklearn, so avoid module-wide import. from bayeslite.regress import regress_ols coefficients = regress_ols( target_values, given_values, given_names, stattypes) # Store the results in a winder. temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) out = compiler.Output(0, {}, {}) out.winder(''' CREATE TEMP TABLE %s (variable TEXT, coefficient REAL); ''' % (qtt,), ()) for variable, coef in coefficients: out.winder(''' INSERT INTO %s VALUES (?, ?) 
''' % (qtt), (variable, coef,)) out.write('SELECT * FROM %s ORDER BY variable' % (qtt,)) out.unwinder('DROP TABLE %s' % (qtt,), ()) winders, unwinders = out.getwindings() return execute_wound( bdb, winders, unwinders, out.getvalue(), out.getbindings()) assert False # XXX
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of BayesDB,
    you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first

    :raises ValueError: if `metamodel` is not ``crosscat``, if the
        `create`/`ifnotexists` flags are inconsistent, if the table or
        generator is missing or mismatched, or if the legacy schema
        disagrees with a generator that already has models
    :raises IOError: if the contents of the pickled file do not look
        like a legacy model
    """
    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    #
    # NOTE(security): unpickling can execute arbitrary code.  Only
    # load legacy model files that come from a trusted source.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if 'cctype' not in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    column_stattypes = dict(
        (casefold(column_name), casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Execute the query text itself; the first argument
                # to sql_execute is the SQL, not the database handle.
                cursor = bdb.sql_execute(sql, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:  # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]'
                    % (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i, name in enumerate(view_column_names):
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    view_column_names[i] = \
                        core.bayesdb_table_column_name(bdb, table, colno)

        # Determine where to start numbering the new models.  The
        # generator may have been dropped and recreated above, so look
        # its id up afresh.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
def parse(self, schema):
    """Parse the given `schema` for a `composer` metamodel.

    An example of a schema is::

        CREATE GENERATOR foo FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Launch_Mass_kg, Purpose
            ),
            dependent(Launch_Mass_kg, Dry_Mass_kg, Power_watts),
            dependent(Perigee_km, Apogee_km),
            independent(Operator_Owner, Inclination_radians)
        );

    The schema must adhere to the following rules:

    - Default metamodel is identified `default` or `crosscat`. Every
      `colname` must have its `stattype` declared. IGNORE and GUESS(*)
      are forbidden.

    - Foreign predictors are identified by the `name()` method of the
      object used when `Composer.register_foreign_predictor` was
      invoked. For example::

        >> from bdbcontrib.foreign.random_forest import RandomForest
        >> composer.register_foreign_predictor(random_forest.RandomForest)
        >> RandomForest.name()
        random_forest

      The grammar inside foreign predictor directives is::

        <target> <stattype> GIVEN <condition> [...[condition]]

    All columns specified in `dependent` and `independent` directives
    must be modeled by the `default` metamodel.

    The `schema` argument is only read; its nested lists are not
    modified.

    Parameters
    ----------
    schema : list<list>
        The `schema` as parsed by bayesdb.

    Returns
    -------
    columns : dict(str:str)
        A dict(colname:stattype) mapping every `colname` declared in
        `schema` to its `stattype`.

    lcols : list<str>
        A list of columns modeled by `default` model.

    fcols : list<str>
        A list of columns modeled by foreign predictor.

    fcol_to_pcols : dict(str:list<str>)
        A dict(fcol:conditions) mapping `fcol` to a list of its parent
        columns.

    fcol_to_fpred : dict(str:str)
        A dict(fcol:fpred) mapping `fcol` to the name of its foreign
        predictor. The values in the dictionary are keys in
        `self.predictor_builder`.

    dependencies : list(<tuple(<bool>,<list<str>)>)
        A list of dependency constraints. Each entry in the list is a
        tuple. For example, (True, ['foo', 'bar', 'baz']) means the
        three variables are mutually and pairwise *dependent*.

    Raises
    ------
    ValueError
        If a directive is unknown, truncated, or otherwise malformed,
        or if the declared columns violate the rules above.
    """
    def next_token(tokens, directive):
        # Casefolded next token; a directive that ends early is a schema
        # error like any other, not an IndexError/StopIteration.
        try:
            token = next(tokens)
        except StopIteration:
            raise ValueError('Incomplete "{}" directive.'.format(directive))
        return casefold(token)

    # Allowed keywords.  list(...) keeps the concatenation valid whether
    # keys() yields a list or a view.
    DIRECTIVES = ['crosscat', 'default', 'dependent', 'independent'] + \
        list(self.predictor_builder.keys())
    STATTYPES = ['numerical', 'categorical']

    # Data structures to return.
    columns = {}
    lcols = []
    fcols = []
    fcol_to_pcols = dict()
    fcol_to_fpred = dict()
    dependencies = []

    # Parse!
    for block in schema:
        if len(block) == 0:
            continue
        if len(block) < 2:
            raise ValueError(
                'Missing commands in "{}" directive.'.format(block[0]))
        directive = casefold(block[0])
        commands = block[1]
        if directive not in DIRECTIVES:
            raise ValueError('Unknown directive "{}".\n'
                'Available directives: {}.'.format(directive, DIRECTIVES))
        if not isinstance(commands, list):
            raise ValueError('Unknown commands in "{}" directive: {}.'\
                .format(directive, commands))
        # Walk the tokens with an iterator rather than pop(0), so the
        # caller's schema is left intact.
        tokens = iter(commands)
        if directive == 'default' or directive == 'crosscat':
            for token in tokens:
                c = casefold(token)
                if c == ',':
                    continue
                s = next_token(tokens, directive)
                if s not in STATTYPES:
                    raise ValueError('Invalid stattype "{}".'.format(s))
                columns[c] = s
                lcols.append(c)
        elif directive == 'independent' or directive == 'dependent':
            members = []
            for token in tokens:
                c = casefold(token)
                if c == ',':
                    continue
                members.append(c)
            dependencies.append((directive == 'dependent', members))
        elif directive in self.predictor_builder:
            c = next_token(tokens, directive)
            s = next_token(tokens, directive)
            if s not in STATTYPES:
                raise ValueError('Invalid stattype "{}".'.format(s))
            columns[c] = s
            given = next_token(tokens, directive)
            if given != 'given':
                raise ValueError('Execpted GIVEN keyword, received: {}.'\
                    .format(given))
            conditions = []
            for token in tokens:
                r = casefold(token)
                if r == ',':
                    continue
                conditions.append(r)
            fcols.append(c)
            fcol_to_pcols[c] = conditions
            fcol_to_fpred[c] = directive

    # Unique lcols.
    if len(lcols) != len(set(lcols)):
        raise ValueError('Duplicate default columns enountered: {}.'\
            .format(lcols))

    # Unique fcols.
    if len(fcols) != len(set(fcols)):
        raise ValueError('Duplicate foreign columns enountered: {}.'\
            .format(fcols))

    # All stattypes declared.
    for conditions in fcol_to_pcols.values():
        for r in conditions:
            if r not in columns:
                raise ValueError('No stattype declaration for "{}".'\
                    .format(r))

    # No col both lcol and fcol.
    for l in lcols:
        if l in fcol_to_pcols:
            raise ValueError('Column "{}" can only be modeled once.'\
                .format(l))

    # No non-default dependencies.
    default_cols = set(lcols)
    for dep in dependencies:
        for col in dep[1]:
            if col not in default_cols:
                raise ValueError('Column "{}" with dependency constraint '
                    'must have default model.'.format(col))

    # Return the hodgepodge.
    return (columns, lcols, fcols, fcol_to_pcols, fcol_to_fpred,
        dependencies)
def bayesdb_has_stattype(bdb, stattype):
    """Return whether `stattype` has a row in ``bayesdb_stattype``.

    The name is casefolded before it is bound to the query, and the
    result is true when the count of matching rows is positive.
    """
    bindings = {'stattype': casefold(stattype)}
    query = 'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype'
    nmatches = cursor_value(bdb.sql_execute(query, bindings))
    return nmatches > 0
def bayesdb_stattype_affinity(_bdb, stattype):
    """Look up the affinity for `stattype` in ``_STATTYPE_TO_AFFINITY``.

    Asserts that `stattype` is registered in `_bdb`; the table is keyed
    by the casefolded name.
    """
    assert bayesdb_has_stattype(_bdb, stattype)
    folded_name = casefold(stattype)
    affinity = _STATTYPE_TO_AFFINITY[folded_name]
    return affinity
def p_dist_name(self, dist):
    """Return the casefolded form of the distribution name `dist`."""
    folded = casefold(dist)
    return folded

def p_foreign_name(self, foreign):
    """Return the casefolded form of the foreign name `foreign`."""
    folded = casefold(foreign)
    return folded
def bayesdb_read_csv(bdb, table, f, header=False, create=False,
        ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway

    :raises ValueError: if the flags are inconsistent (`create` without
        `header`, or `ifnotexists` without `create`), if the table
        exists and may not be reused, or if it is missing and may not
        be created
    :raises IOError: if the header is missing, empty, or has blank,
        duplicate, or unknown column names, or if a data row has the
        wrong number of fields (the message names the offending line)
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        # Line number of the next record to be read, for error messages.
        line = 1
        if header:
            row = None
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c) == 0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' %repr(column_names))
            column_name_map = {}
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # Materialize the quoted names: they are used twice below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        # Count records as they are read so that errors name the row
        # that actually failed rather than always the first one.
        for lineno, row in enumerate(reader, line):
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (lineno, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (lineno, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
def p_var_name(self, var):
    """Return the casefolded form of the variable name `var`."""
    folded = casefold(var)
    return folded

def p_stattype_s(self, st):
    """Return the statistical type `st` exactly as given."""
    stattype = st
    return stattype
def bayesdb_guess_stattypes(column_names, rows, null_values=None,
        numcat_count=None, numcat_ratio=None, distinct_ratio=None,
        nullify_ratio=None, overrides=None):
    """Heuristically guess a statistical type for each column of `rows`.

    The result is a list of statistical types, one per entry of
    `column_names`, in the same order.  Column names are compared
    case-insensitively throughout.

    :param set null_values: values to nullify
        (default ``"", "N/A", "none", "None"``)
    :param int numcat_count: number of distinct values below which a
        column whose values all parse as numbers is treated as
        categorical anyway (default 20)
    :param real numcat_ratio: ratio of distinct values to total values
        below which a column whose values all parse as numbers is
        treated as categorical anyway (default 0.02)
    :param real distinct_ratio: ratio of distinct values to total values
        above which a column is ignored as a pseudo-key, only if
        count > numcat_count (default 0.9)
    :param real nullify_ratio: ratio of the count of the most numerous
        value to the total number of values above which that value is
        nullified; set to 1 to turn off (default 0.9)
    :param list overrides: list of ``(name, stattype)`` pairs that
        replace the guess for the named columns; besides statistical
        types, ``key`` or ``ignore`` may be given
    :raises ValueError: on duplicate column names, unknown or duplicate
        overrides, rows of the wrong width, more than one ``key``
        override, or a ``key`` override on a non-unique column
    """
    # Substitute defaults for anything the caller left out.
    if null_values is None:
        null_values = set(("", "N/A", "none", "None"))
    if numcat_count is None:
        numcat_count = 20
    if numcat_ratio is None:
        numcat_ratio = 0.02
    if distinct_ratio is None:
        distinct_ratio = 0.9
    if nullify_ratio is None:
        nullify_ratio = 0.9
    if overrides is None:
        overrides = []

    # Collect the folded column names, noting any repeats.
    column_name_set = set()
    repeated = set()
    for name in column_names:
        folded = casefold(name)
        if folded in column_name_set:
            repeated.add(name)
        column_name_set.add(folded)
    if repeated:
        raise ValueError('Duplicate column names: %s'
            % (repr(list(repeated)),))

    # Index the overrides by folded column name.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown = set()
    repeated = set()
    for name, stattype in overrides:
        folded = casefold(name)
        if folded not in column_name_set:
            unknown.add(name)
        elif folded in override_map:
            repeated.add(name)
        else:
            override_map[folded] = casefold(stattype)
    if unknown:
        raise ValueError('Unknown columns overridden: %s'
            % (repr(list(unknown)),))
    if repeated:
        raise ValueError('Duplicate columns overridden: %s'
            % (repr(list(repeated)),))

    # Every row must be exactly as wide as the list of column names.
    ncols = len(column_names)
    assert ncols == len(unique(map(casefold, column_names)))
    for ri, row in enumerate(rows):
        width = len(row)
        if width < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d'
                % (ri, width, ncols))
        if width > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d'
                % (ri, width, ncols))

    # An explicit ``key`` override is settled before any guessing, so
    # the guesser knows whether a key has already been chosen.
    key = None
    duplicate_keys = set()
    for ci, column_name in enumerate(column_names):
        if override_map.get(casefold(column_name)) != 'key':
            continue
        if key is not None:
            duplicate_keys.add(column_name)
            continue
        column = [row[ci] for row in rows]
        ints = integerify(column)
        if ints:
            column = ints
        if not keyable_p(column):
            raise ValueError('Column non-unique but specified as key: %s'
                % (repr(column_name),))
        key = column_name
    if duplicate_keys:
        raise ValueError('Multiple columns overridden as keys: %s'
            % (repr(list(duplicate_keys)),))

    # Take the override where there is one; otherwise guess.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        folded = casefold(column_name)
        if folded in override_map:
            stattype = override_map[folded]
        else:
            column = nullify(null_values, rows, ci)
            stattype = guess_column_stattype(
                column,
                distinct_ratio=distinct_ratio,
                nullify_ratio=nullify_ratio,
                numcat_count=numcat_count,
                numcat_ratio=numcat_ratio,
                have_key=(key is not None))
            if stattype == 'key':
                key = column_name
        stattypes.append(stattype)
    return stattypes
def _is_nominal(stattype):
    """Return True if `stattype` case-folds to a nominal type name."""
    folded = casefold(stattype)
    return folded == 'nominal' or folded == 'unbounded_nominal'
def dot_describe(self, line):
    """describe BayesDB entities
    [table(s)|generator(s)|columns|model(s)] [<name>...]

    Print a human-readable description of the specified BayesDB
    entities.
    """
    usage = (
        "Usage: .describe table(s) [<table>...]\n",
        " .describe generator(s) [<gen>...]\n",
        " .describe columns <gen>\n",
        " .describe model(s) <gen> [<model>...]\n",
    )

    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if not tokens:
        for text in usage:
            self.stdout.write(text)
        return

    command = casefold(tokens[0])

    if command in ("table", "tables"):
        # With no names, describe every table; otherwise only those named.
        if len(tokens) == 1:
            params = ()
            qualifier = "1"
        else:
            params = tokens[1:]
            qualifier = "(" + " OR ".join(["tabname = ?"] * len(params)) + ")"
        # Report every missing table before giving up.
        all_found = True
        for table in params:
            if not core.bayesdb_has_table(self._bdb, table):
                self.stdout.write("No such table: %s\n" % (repr(table),))
                all_found = False
        if not all_found:
            return
        for table in params:
            core.bayesdb_table_guarantee_columns(self._bdb, table)
        sql = (
            " SELECT tabname, colno, name, shortname"
            " FROM bayesdb_column"
            " WHERE %s"
            " ORDER BY tabname ASC, colno ASC "
        ) % (qualifier,)
        with self._bdb.savepoint():
            pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        return

    if command in ("generator", "generators"):
        if len(tokens) == 1:
            params = ()
            qualifier = "1"
        else:
            params = tokens[1:]
            # Numbered placeholders let each name be used twice below.
            names = ",".join(
                "?%d" % (n,) for n in range(1, len(params) + 1))
            qualifier = (
                " (name IN ({names})"
                " OR (defaultp AND tabname IN ({names}))) "
            ).format(names=names)
        # Report every missing generator before giving up.
        all_found = True
        for generator in params:
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                self.stdout.write(
                    "No such generator: %s\n" % (repr(generator),))
                all_found = False
        if not all_found:
            return
        sql = (
            " SELECT id, name, tabname, metamodel"
            " FROM bayesdb_generator"
            " WHERE %s "
        ) % (qualifier,)
        with self._bdb.savepoint():
            pretty.pp_cursor(self.stdout, self._bdb.sql_execute(sql, params))
        return

    if command == "columns":
        if len(tokens) != 2:
            self.stdout.write("Describe columns of what generator?\n")
            return
        generator = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                self.stdout.write(
                    "No such generator: %s\n" % (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator_default(
                self._bdb, generator)
            sql = (
                " SELECT c.colno AS colno, c.name AS name,"
                " gc.stattype AS stattype, c.shortname AS shortname"
                " FROM bayesdb_generator AS g,"
                " (bayesdb_column AS c"
                " LEFT OUTER JOIN bayesdb_generator_column AS gc"
                " USING (colno))"
                " WHERE g.id = ? AND g.id = gc.generator_id"
                " AND g.tabname = c.tabname"
                " ORDER BY colno ASC; "
            )
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
        return

    if command in ("model", "models"):
        if len(tokens) < 2:
            self.stdout.write("Describe models of what generator?\n")
            return
        generator = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                self.stdout.write(
                    "No such generator: %s\n" % (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator_default(
                self._bdb, generator)
            if len(tokens) == 2:
                qualifier = "1"
            else:
                # Each extra token must be the number of an existing model.
                modelnos = []
                for token in tokens[2:]:
                    try:
                        modelno = int(token)
                    except ValueError:
                        self.stdout.write(
                            "Invalid model number: %s\n" % (repr(token),))
                        return
                    if not core.bayesdb_generator_has_model(
                            self._bdb, generator_id, modelno):
                        self.stdout.write("No such model: %d\n" % (modelno,))
                        return
                    modelnos.append(modelno)
                qualifier = "modelno IN (%s)" % (
                    ",".join(str(m) for m in modelnos),)
            sql = (
                " SELECT modelno, iterations"
                " FROM bayesdb_generator_model"
                " WHERE generator_id = ? AND %s "
            ) % (qualifier,)
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
        return

    # Unrecognized entity kind.
    for text in usage:
        self.stdout.write(text)
def _is_countable(stattype):
    """Return True if `stattype` case-folds to 'counts' or 'boolean'."""
    folded = casefold(stattype)
    return folded == 'counts' or folded == 'boolean'
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        default=None):
    """Record a generator and its modelled columns, validating `columns`.

    If a generator named `gen_name` already exists, nothing is inserted;
    its id is looked up and `columns` is still validated.

    :param bdb: BayesDB instance used for all queries
    :param gen_name: name of the generator
    :param table: name of the table the generator models
    :param metamodel: object whose ``name()`` is stored with the generator
    :param columns: iterable of ``(name, stattype)`` pairs
    :param default: stored as ``defaultp``; ``None`` means ``False``
    :returns: ``(generator_id, column_list)``, where `column_list` is a
        sorted list of ``(colno, name, stattype)`` triples
    :raises BQLError: if `gen_name` is already a table name, or if
        `columns` names missing columns, repeats a column, or uses a
        statistical type absent from ``bayesdb_stattype``
    """
    if default is None:
        default = False

    # Refuse a generator name that is already a table name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, 'Name already defined as table: %s'
            % (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = core.bayesdb_has_generator(bdb, gen_name)
    if not generator_already_existed:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator (name, tabname, metamodel, defaultp) VALUES (:name, :table, :metamodel, :defaultp)'''
        bdb.sql_execute(generator_sql, {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': default,
        })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert 0 < generator_id

    # Map each folded column name to its colno, collecting
    # - duplicates,
    # - nonexistent columns,
    # - invalid statistical types.
    colno_sql = ''' SELECT colno FROM bayesdb_column WHERE tabname = :table AND name = :column_name '''
    stattype_sql = ''' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '''
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0:
            invalid.add(stattype)
            continue
        column_map[name_folded] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %s: %s'
            % (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %s'
            % (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %s'
            % (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records, storing the folded statistical type.
        column_sql = ''' INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) VALUES (:generator_id, :colno, :stattype) '''
        for name, stattype in columns:
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': column_map[casefold(name)],
                'stattype': casefold(stattype),
            })

    # The returned triples carry the statistical types as the caller
    # spelled them.
    column_list = sorted(
        (column_map[casefold(name)], name, stattype)
        for name, stattype in columns)
    return generator_id, column_list
def scan_name(_scanner, text):
    """Return the token for `text`: a keyword if it is one, else a name.

    `keywords` is consulted first with `text` as written and then with
    its case-folded form; if neither lookup gives a true value, the
    result is ``grammar.L_NAME``.
    """
    token = keywords.get(text)
    if not token:
        token = keywords.get(casefold(text))
    if not token:
        token = grammar.L_NAME
    return token
def define_correlation_p(stattype0, stattype1, method):
    """Register `method` in `correlation_p_methods` for a stattype pair.

    Both statistical types must already be case-folded, and the pair
    must not have been registered before; both conditions are checked
    with assertions.
    """
    for stattype in (stattype0, stattype1):
        assert casefold(stattype) == stattype
    pair = (stattype0, stattype1)
    assert pair not in correlation_p_methods
    correlation_p_methods[pair] = method
def bayesdb_read_csv(bdb, table, f, header=False, create=False,
        ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    All database work is done inside one ``bdb.savepoint()`` block.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if the table exists
        and `create` is set without `ifnotexists`, or if the table does
        not exist and `create` is not set
    :raises IOError: if the header is missing or empty, has duplicate
        or unknown column names, or if a data row has the wrong number
        of fields
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        # `line` is the 1-based number of the CSV record about to be
        # read; it is only used in error messages.
        line = 1
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Column names are compared case-insensitively.
            seen = set()
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in seen:
                    duplicates.add(name_folded)
                else:
                    seen.add(name_folded)
            if duplicates:
                raise IOError('Duplicate columns in CSV: %s'
                    % (repr(list(duplicates)), ))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # A real list, because it is iterated twice below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                              (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                              (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
            # Advance the counter so a later error names the right row.
            line += 1
try: cursor.next() except StopIteration: pass else: generator = bayesdb_generator_table(bdb, generator_id) raise BQLError(bdb, 'More than one such row' ' in table %s for generator %s: %d' % (repr(table_name), repr(generator), repr(rowid))) return row def bayesdb_generator_fresh_row_id(bdb, generator_id): table_name = bayesdb_generator_table(bdb, generator_id) qt = sqlite3_quote_name(table_name) cursor = bdb.sql_execute('SELECT MAX(_rowid_) FROM %s' % (qt,)) max_rowid = cursor_value(cursor) if max_rowid is None: max_rowid = 0 return max_rowid + 1 # Synthesize a non-existent SQLite row id # XXX This should be stored in the database by adding a column to the # bayesdb_stattype table -- when we are later willing to contemplate # adding statistical types, e.g. COUNT, SCALE, or NONNEGATIVE REAL. _STATTYPE_TO_AFFINITY = dict((casefold(st), casefold(af)) for st, af in ( ('categorical', 'text'), ('cyclic', 'real'), ('numerical', 'real'), )) def bayesdb_stattype_affinity(_bdb, stattype): return _STATTYPE_TO_AFFINITY[casefold(stattype)]
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    """Build the CGPM schema dict for a generator from its schema clauses.

    Each clause of `schema_ast` is handled according to its type in
    ``cgpm_schema.parse`` (Basic, Latent, Foreign, or Subsample).
    Population variables that no clause modelled are then given a
    default distribution, and the statistical types that Foreign
    clauses left as placeholders are filled in.

    NOTE: `schema_ast` is modified in place.  Latent clauses built from
    ``Foreign.exposed``, and possibly a Foreign clause for the baseline,
    are appended to it, and the ``outputs`` list of each Foreign clause
    with exposed variables is extended.

    :param bdb: passed to the ``core`` lookups and to :class:`BQLError`
    :param generator_id: id of the generator the schema is for
    :param list schema_ast: parsed schema clauses
    :param kwargs: only the ``baseline`` key is read
    :returns: dict with keys ``variables``, ``cgpm_composition``,
        ``subsample``, and ``latents``
    :raises BQLError: for a schema given together with a non-crosscat
        baseline, a duplicate subsample, an unknown clause, duplicate or
        unknown model variables, latent variables that already exist,
        unknown statistical types, or variables that are needed as
        inputs but never modelled
    """
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    # NOTE(review): `table` is not referenced again in this function.
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.
        # NOTE(review): unpacking zip(*...) into two names raises
        # ValueError if baseline.params is empty -- confirm callers
        # always pass at least one parameter.
        ps, vs = zip(*baseline.params)
        vs_new = [v.value for v in vs]
        params = zip(ps, vs_new)
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(outputs, [], [], baseline.name,
                                           params)
        # And append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            # These four lists start out as None placeholders and are
            # filled in by index after the clause loop; `distargs` holds
            # references to the same list objects.
            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb,
                       'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb, 'Latent variables already defined: %r' %
            (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb,
                       'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        """Return ``(dist, params)`` for `var`, or None if unsupported.

        When the case-folded `stattype` has no entry in `_DEFAULT_DIST`,
        the variable is recorded in `unknown_stattype` so that all such
        variables can be reported together later.
        """
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue

        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams

        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments. They need to
    # be in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable. Trying to using
            # default_dist might lookup the counts for unique values of the
            # categorical in the base table causing a failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams

        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    # Report every variable whose statistical type had no default
    # distribution, all at once.
    if unknown_stattype:
        raise BQLError(
            bdb, 'Unknown statistical types for variables: %r' %
            (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        default=None):
    """Record a generator and its modelled columns, validating `columns`.

    If a generator named `gen_name` already exists, nothing is inserted;
    its id is looked up and `columns` is still validated.

    :param bdb: BayesDB instance used for all queries
    :param gen_name: name of the generator
    :param table: name of the table the generator models
    :param metamodel: object whose ``name()`` is stored with the generator
    :param columns: iterable of ``(name, stattype)`` pairs
    :param default: stored as ``defaultp``; ``None`` means ``False``
    :returns: ``(generator_id, column_list)``, where `column_list` is a
        sorted list of ``(colno, name, stattype)`` triples
    :raises BQLError: if `gen_name` is already a table name, or if
        `columns` names missing columns, repeats a column, or uses a
        statistical type absent from ``bayesdb_stattype``
    """
    if default is None:
        default = False

    # Refuse a generator name that is already a table name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(
            bdb, 'Name already defined as table: %s' % (repr(gen_name), ))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = core.bayesdb_has_generator(bdb, gen_name)
    if not generator_already_existed:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator (name, tabname, metamodel, defaultp) VALUES (:name, :table, :metamodel, :defaultp)'''
        bdb.sql_execute(generator_sql, {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': default,
        })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert 0 < generator_id

    # Map each folded column name to its colno, collecting
    # - duplicates,
    # - nonexistent columns,
    # - invalid statistical types.
    colno_sql = ''' SELECT colno FROM bayesdb_column WHERE tabname = :table AND name = :column_name '''
    stattype_sql = ''' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '''
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0:
            invalid.add(stattype)
            continue
        column_map[name_folded] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(
            bdb, 'Duplicate column names: %s' % (repr(list(duplicates)), ))
    if invalid:
        raise BQLError(
            bdb, 'Invalid statistical types: %s' % (repr(list(invalid)), ))

    if not generator_already_existed:
        # Insert column records, storing the folded statistical type.
        column_sql = ''' INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) VALUES (:generator_id, :colno, :stattype) '''
        for name, stattype in columns:
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': column_map[casefold(name)],
                'stattype': casefold(stattype),
            })

    # The returned triples carry the statistical types as the caller
    # spelled them.
    column_list = sorted(
        (column_map[casefold(name)], name, stattype)
        for name, stattype in columns)
    return generator_id, column_list
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        ifnotexists=None, default=None):
    """Record a generator and its modelled columns, validating `columns`.

    If a generator named `gen_name` already exists and `ifnotexists` is
    true, nothing is inserted: the existing generator's id is looked up
    by name and `columns` is still validated.

    :param bdb: BayesDB instance used for all queries
    :param gen_name: name of the generator
    :param table: name of the table the generator models
    :param metamodel: object whose ``name()`` is stored with the generator
    :param columns: iterable of ``(name, stattype)`` pairs
    :param ifnotexists: if true, an existing generator of this name is
        not an error; ``None`` means ``False``
    :param default: stored as ``defaultp``; ``None`` means ``False``
    :returns: ``(generator_id, column_list)``, where `column_list` is a
        sorted list of ``(colno, name, stattype)`` triples
    :raises BQLError: if `gen_name` is already a table name, if it is
        already a generator name and `ifnotexists` is false, or if
        `columns` names missing columns, repeats a column, or uses a
        statistical type absent from ``bayesdb_stattype``
    """
    if ifnotexists is None:
        ifnotexists = False
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, "Name already defined as table: %s"
            % (repr(gen_name),))

    # Make sure there's no generator by this name unless we were asked
    # to tolerate one.
    generator_already_existed = core.bayesdb_has_generator(bdb, gen_name)
    if generator_already_existed and not ifnotexists:
        raise BQLError(bdb, "Name already defined as generator: %s"
            % (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    if not generator_already_existed:
        # Create the generator record.
        generator_sql = """ INSERT INTO bayesdb_generator (name, tabname, metamodel, defaultp) VALUES (:name, :table, :metamodel, :defaultp) """
        bdb.sql_execute(generator_sql, {
            "name": gen_name,
            "table": table,
            "metamodel": metamodel.name(),
            "defaultp": default,
        })

    # Look the id up by name rather than trusting cursor.lastrowid,
    # which does not identify the existing row when no row was inserted.
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    colno_sql = """ SELECT colno FROM bayesdb_column WHERE tabname = :table AND name = :column_name """
    stattype_sql = """ SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype """
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql,
            {"table": table, "column_name": name})
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {"stattype": stattype})
        if cursor_value(cursor) == 0:
            invalid.add(stattype)
            continue
        column_map[name_folded] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, "No such columns in table %s: %s"
            % (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, "Duplicate column names: %s"
            % (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, "Invalid statistical types: %s"
            % (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records only for a generator created just now,
        # so an existing generator never gets a second set of rows.
        column_sql = """ INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) VALUES (:generator_id, :colno, :stattype) """
        for name, stattype in columns:
            bdb.sql_execute(column_sql, {
                "generator_id": generator_id,
                "colno": column_map[casefold(name)],
                "stattype": casefold(stattype),
            })

    # The returned triples carry the statistical types as the caller
    # spelled them.
    column_list = sorted(
        (column_map[casefold(name)], name, stattype)
        for name, stattype in columns)
    return generator_id, column_list
def dot_describe(self, line):
    '''describe BayesDB entities
    [table(s)|generator(s)|columns|model(s)] [<name>...]

    Print a human-readable description of the specified BayesDB
    entities.
    '''
    # NOTE(review): the docstring wording is left as it was because its
    # layout looks like it is consumed as help text for this shell
    # command -- confirm before rephrasing it.
    emit = self.stdout.write

    def usage():
        # One line per supported subcommand.
        for text in (
                'Usage: .describe table(s) [<table>...]\n',
                ' .describe generator(s) [<gen>...]\n',
                ' .describe columns <gen>\n',
                ' .describe model(s) <gen> [<model>...]\n',
        ):
            emit(text)

    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if not tokens:
        usage()
        return

    subcommand = casefold(tokens[0])
    has_names = len(tokens) > 1

    if subcommand in ('table', 'tables'):
        if has_names:
            params = tokens[1:]
            qualifier = \
                '(' + ' OR '.join(['tabname = ?'] * len(params)) + ')'
        else:
            # No names given: describe every table.
            params = ()
            qualifier = '1'
        all_found = True
        for table in params:
            if core.bayesdb_has_table(self._bdb, table):
                continue
            emit('No such table: %s\n' % (repr(table),))
            all_found = False
        if not all_found:
            return
        for table in params:
            core.bayesdb_table_guarantee_columns(self._bdb, table)
        sql = (
            ' SELECT tabname, colno, name, shortname'
            ' FROM bayesdb_column'
            ' WHERE %s'
            ' ORDER BY tabname ASC, colno ASC '
        ) % (qualifier,)
        with self._bdb.savepoint():
            # This branch runs the statement through execute(), unlike
            # the other branches, which use sql_execute().
            pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        return

    if subcommand in ('generator', 'generators'):
        if has_names:
            params = tokens[1:]
            # Numbered placeholders (?1, ?2, ...) let the same bindings
            # be referenced by both IN lists of the qualifier.
            names = ','.join(
                '?%d' % (position,)
                for position, _name in enumerate(params, 1))
            qualifier = (
                ' (name IN ({names})'
                ' OR (defaultp AND tabname IN ({names}))) '
            ).format(names=names)
        else:
            # No names given: describe every generator.
            params = ()
            qualifier = '1'
        all_found = True
        for generator in params:
            if core.bayesdb_has_generator_default(self._bdb, generator):
                continue
            emit('No such generator: %s\n' % (repr(generator),))
            all_found = False
        if not all_found:
            return
        sql = (
            ' SELECT id, name, tabname, metamodel'
            ' FROM bayesdb_generator'
            ' WHERE %s '
        ) % (qualifier,)
        with self._bdb.savepoint():
            pretty.pp_cursor(
                self.stdout, self._bdb.sql_execute(sql, params))
        return

    if subcommand == 'columns':
        if len(tokens) != 2:
            emit('Describe columns of what generator?\n')
            return
        generator = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                emit('No such generator: %s\n' % (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator_default(
                self._bdb, generator)
            sql = (
                ' SELECT c.colno AS colno, c.name AS name,'
                ' gc.stattype AS stattype, c.shortname AS shortname'
                ' FROM bayesdb_generator AS g,'
                ' (bayesdb_column AS c LEFT OUTER JOIN'
                ' bayesdb_generator_column AS gc'
                ' USING (colno))'
                ' WHERE g.id = ? AND g.id = gc.generator_id'
                ' AND g.tabname = c.tabname'
                ' ORDER BY colno ASC; '
            )
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
        return

    if subcommand in ('model', 'models'):
        if len(tokens) < 2:
            emit('Describe models of what generator?\n')
            return
        generator = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                emit('No such generator: %s\n' % (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator_default(
                self._bdb, generator)
            if len(tokens) == 2:
                # No model numbers given: describe every model.
                qualifier = '1'
            else:
                modelnos = []
                for token in tokens[2:]:
                    try:
                        modelno = int(token)
                    except ValueError:
                        emit('Invalid model number: %s\n' % (repr(token),))
                        return
                    if not core.bayesdb_generator_has_model(
                            self._bdb, generator_id, modelno):
                        emit('No such model: %d\n' % (modelno,))
                        return
                    modelnos.append(modelno)
                # The model numbers were parsed by int() above, so they
                # are interpolated directly rather than bound.
                qualifier = 'modelno IN (%s)' % \
                    (','.join(str(modelno) for modelno in modelnos),)
            sql = (
                ' SELECT modelno, iterations'
                ' FROM bayesdb_generator_model'
                ' WHERE generator_id = ? AND %s '
            ) % (qualifier,)
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
        return

    # Unrecognized subcommand.
    usage()
def dot_describe(self, line):
    '''describe BayesDB entities
    [table(s)|population(s)|variables|generator(s)|model(s)] [<name>...]

    Print a human-readable description of the specified BayesDB
    entities.
    '''
    # NOTE(review): the docstring layout looks like it is consumed as
    # help text for this shell command -- confirm before rephrasing it.
    # Its synopsis now lists the subcommands this function actually
    # dispatches on.

    def usage():
        # Single source of truth for the usage text, so that an empty
        # command line and an unrecognized subcommand print the same
        # list, including the population(s) line.
        for text in (
                'Usage: .describe table(s) [<table>...]\n',
                ' .describe population(s) [<pop>...]\n',
                ' .describe variables <pop>\n',
                ' .describe generator(s) [<gen>...]\n',
                ' .describe model(s) <gen> [<model>...]\n',
        ):
            self.stdout.write(text)

    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if len(tokens) == 0:
        usage()
        return

    kind = casefold(tokens[0])

    if kind in ('table', 'tables'):
        if len(tokens) == 1:
            # No names given: describe every table.
            params = ()
            qualifier = '1'
        else:
            params = tokens[1:]
            qualifier = \
                '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
        ok = True
        for table in params:
            if not core.bayesdb_has_table(self._bdb, table):
                self.stdout.write('No such table: %s\n' % (repr(table), ))
                ok = False
        if not ok:
            return
        for table in params:
            core.bayesdb_table_guarantee_columns(self._bdb, table)
        sql = (
            ' SELECT tabname, colno, name, shortname'
            ' FROM bayesdb_column'
            ' WHERE %s'
            ' ORDER BY tabname ASC, colno ASC '
        ) % (qualifier, )
        with self._bdb.savepoint():
            # This branch runs the statement through execute(), unlike
            # the other branches, which use sql_execute().
            pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))

    elif kind in ('population', 'populations'):
        if len(tokens) == 1:
            # No names given: describe every population.
            params = ()
            qualifier = '1'
        else:
            params = tokens[1:]
            # range() rather than xrange(), matching the generator
            # branch below; the list of names is short.
            names = ','.join(
                '?%d' % (i + 1, ) for i in range(len(params)))
            qualifier = '(name IN (%s))' % (names, )
        ok = True
        for population in params:
            if not core.bayesdb_has_population(self._bdb, population):
                self.stdout.write(
                    'No such population: %s\n' % (repr(population), ))
                ok = False
        if not ok:
            return
        sql = (
            ' SELECT id, name, tabname'
            ' FROM bayesdb_population'
            ' WHERE %s '
        ) % (qualifier, )
        with self._bdb.savepoint():
            cursor = self._bdb.sql_execute(sql, params)
            pretty.pp_cursor(self.stdout, cursor)

    elif kind in ('generator', 'generators'):
        if len(tokens) == 1:
            # No names given: describe every generator.
            params = ()
            qualifier = '1'
        else:
            params = tokens[1:]
            names = ','.join(
                '?%d' % (i + 1, ) for i in range(len(params)))
            qualifier = ' (name IN ({names})) '.format(names=names)
        ok = True
        for generator in params:
            if not core.bayesdb_has_generator(self._bdb, None, generator):
                self.stdout.write(
                    'No such generator: %s\n' % (repr(generator), ))
                ok = False
        if not ok:
            return
        sql = (
            ' SELECT id, name, tabname, backend'
            ' FROM bayesdb_generator'
            ' WHERE %s '
        ) % (qualifier, )
        with self._bdb.savepoint():
            pretty.pp_cursor(
                self.stdout, self._bdb.sql_execute(sql, params))

    elif kind == 'variables':
        if len(tokens) != 2:
            self.stdout.write('Usage: .describe variables <population>\n')
            return
        population = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_population(self._bdb, population):
                self.stdout.write(
                    'No such population: %r\n' % (population, ))
                return
            population_id = core.bayesdb_get_population(
                self._bdb, population)
            sql = (
                ' SELECT c.colno AS colno, c.name AS name,'
                ' v.stattype AS stattype, c.shortname AS shortname'
                ' FROM bayesdb_population AS p,'
                ' (bayesdb_column AS c LEFT OUTER JOIN'
                ' bayesdb_variable AS v'
                ' USING (colno))'
                ' WHERE p.id = ? AND p.id = v.population_id'
                ' AND p.tabname = c.tabname'
                ' ORDER BY colno ASC; '
            )
            cursor = self._bdb.sql_execute(sql, (population_id, ))
            pretty.pp_cursor(self.stdout, cursor)

    elif kind in ('model', 'models'):
        if len(tokens) < 2:
            self.stdout.write('Describe models of what generator?\n')
            return
        generator = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator(self._bdb, None, generator):
                self.stdout.write(
                    'No such generator: %s\n' % (repr(generator), ))
                return
            generator_id = core.bayesdb_get_generator(
                self._bdb, None, generator)
            if len(tokens) == 2:
                # No model numbers given: describe every model.
                qualifier = '1'
            else:
                modelnos = []
                for token in tokens[2:]:
                    try:
                        modelno = int(token)
                    except ValueError:
                        self.stdout.write(
                            'Invalid model number: %s\n' % (repr(token), ))
                        return
                    if not core.bayesdb_generator_has_model(
                            self._bdb, generator_id, modelno):
                        self.stdout.write(
                            'No such model: %d\n' % (modelno, ))
                        return
                    modelnos.append(modelno)
                # The model numbers were parsed by int() above, so they
                # are interpolated directly rather than bound.
                qualifier = 'modelno IN (%s)' % \
                    (','.join(map(str, modelnos)), )
            sql = (
                ' SELECT modelno, iterations'
                ' FROM bayesdb_generator_model'
                ' WHERE generator_id = ? AND %s '
            ) % (qualifier, )
            cursor = self._bdb.sql_execute(sql, (generator_id, ))
            pretty.pp_cursor(self.stdout, cursor)

    else:
        # Unrecognized subcommand: print the same usage as for an empty
        # command line.
        usage()
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `phrase` may be wrapped in ast.Parametrized, in which case its
    parameter count and named-parameter map are handed to the compiler
    together with `bindings`; otherwise the phrase is compiled with no
    parameters.

    Phrases handled here:
    - queries (ast.is_query): compiled and run through execute_wound,
      whose result is returned;
    - ast.Begin / ast.Rollback / ast.Commit: transaction control;
    - ast.CreateTabAs: CREATE TABLE ... AS <query>;
    - ast.CreateTabCsv: create a table from a CSV file;
    - ast.CreateTabSim: create a table filled with simulated rows.
    All non-query phrases return empty_cursor(bdb).  A phrase of any
    other type falls off the end of this function, which then returns
    None.

    Raises BQLError when the target table already exists (and IF NOT
    EXISTS was not given), or when a CreateTabSim phrase names an
    unknown population, generator, or variable.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            # Emit the CREATE TABLE prefix, then let the compiler append
            # the SQL for the query that supplies the rows.
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Name already defined as table: %s'
                        % (repr(phrase.name), ))
            if not core.bayesdb_has_population(
                    bdb, phrase.simulation.population):
                raise BQLError(
                    bdb,
                    'No such population: %s'
                    % (phrase.simulation.population, ))
            population_id = core.bayesdb_get_population(
                bdb, phrase.simulation.population)
            generator_id = None
            if phrase.simulation.generator is not None:
                if not core.bayesdb_has_generator(
                        bdb, population_id, phrase.simulation.generator):
                    raise BQLError(
                        bdb,
                        'No such generator: %r'
                        % (phrase.simulation.generator, ))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.simulation.generator)
            table = core.bayesdb_population_table(bdb, population_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = phrase.simulation.columns
            # A real list: the quoted names are consumed three times
            # below (schema, column list, placeholders).
            qcns = [sqlite3_quote_name(c) for c in column_names]

            # Look up the declared SQL type of every column of the
            # population's base table, keyed by casefolded name.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)

            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        'No such variable in population %r: %s'
                        % (phrase.simulation.population, column_name))
            for column_name, _expression in phrase.simulation.constraints:
                cn = casefold(column_name)
                # A constraint may name a table column or a rowid token.
                if (cn not in column_sqltypes and
                        cn not in core.bayesdb_rowid_tokens(bdb)):
                    raise BQLError(
                        bdb,
                        'No such variable in population %s: %s'
                        % (phrase.simulation.population, column_name))

            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            # Build one SELECT that evaluates the sample count (cast to
            # integer) followed by every constraint expression.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(
                    bdb, phrase.simulation.nsamples, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(
                    out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)

            def map_var(var):
                # Rowid tokens pass through casefolded; anything else is
                # translated to its variable number.
                if casefold(var) not in core.bayesdb_rowid_tokens(bdb):
                    return core.bayesdb_variable_number(
                        bdb, population_id, generator_id, var)
                else:
                    return casefold(var)

            # Pair each constraint's variable with the value computed
            # for its expression (columns 1.. of the single result row).
            # Written as comprehensions, not tuple-parameter unpacking
            # and map(), so the code also parses on Python 3 and always
            # yields lists.
            constraints = [
                (map_var(var), value)
                for (var, _expression), value
                in zip(phrase.simulation.constraints, cursor[0][1:])
            ]
            colnos = [map_var(column_name) for column_name in column_names]

            schema = ','.join(
                '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                for qcn, column_name in zip(qcns, column_names))
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)'
                % ('TEMP ' if phrase.temp else '',
                   'IF NOT EXISTS ' if phrase.ifnotexists else '',
                   qn, schema))
            insert_sql = ' INSERT INTO %s (%s) VALUES (%s) ' % (
                qn, ','.join(qcns), ','.join('?' for _qcn in qcns))
            for row in bqlfn.bayesdb_simulate(
                    bdb, population_id, constraints, colnos,
                    generator_id=generator_id,
                    numpredictions=nsamples,
                    accuracy=phrase.simulation.accuracy):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)
def bayesdb_stattype_affinity(_bdb, stattype):
    """Return the affinity recorded for `stattype` in _STATTYPE_TO_AFFINITY.

    The lookup key is the casefolded `stattype`.  `_bdb` is not used.
    A failed lookup propagates whatever the mapping raises (KeyError if
    it is a plain dict).
    """
    folded_stattype = casefold(stattype)
    return _STATTYPE_TO_AFFINITY[folded_stattype]
def _create_population(bdb, phrase):
    """Create the population described by `phrase`.

    Reads `phrase.name`, `phrase.table`, `phrase.ifnotexists` and
    `phrase.schema`.  Schema items are sorted by type into modelled
    variables (ast.PopModelVars), ignored columns (ast.PopIgnoreVars)
    and columns whose statistical type is to be guessed
    (ast.PopGuessVars; the single name '*' means every column not
    otherwise mentioned).  One bayesdb_population row is inserted, plus
    one bayesdb_variable row for every variable that is not ignored.

    Returns None.  If the population already exists and
    `phrase.ifnotexists` is true, nothing is done.

    Raises BQLError if the population already exists (without IF NOT
    EXISTS), if '*' is combined with other GUESS names, if some base
    table column has no modelling policy, or if the schema names a
    missing column, repeats a column, or uses an unknown statistical
    type.
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        raise BQLError(
            bdb,
            'Name already defined as population: %r' % (phrase.name, ))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to
    # provide a strategy for each single variable, either MODEL, IGNORE,
    # or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    # NOTE(review): the record is inserted before the schema is
    # validated; presumably the caller runs this inside a savepoint so
    # that a BQLError raised below rolls it back -- confirm.
    bdb.sql_execute(
        ' INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?) ',
        (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [(name, s.stattype) for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopModelVars)))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [(name, 'ignore') for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb,
                'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess, ))
        # The wildcard stands for every base table column that is
        # neither modelled nor ignored explicitly.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        guessed = list(zip(pop_guess, pop_guess_stattypes))
        # Partition the guesses: `key` columns become ignored columns,
        # everything else stays a guessed variable.
        pop_guess_vars = [(col, st) for col, st in guessed if st != 'key']
        pop_ignore_vars.extend(
            (col, 'ignore') for col, st in guessed if st == 'key')
    else:
        pop_guess_vars = []

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb,
            'Cannot determine a modeling policy for variables: %r'
            % (not_found, ))

    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = (
        ' SELECT colno FROM bayesdb_column'
        ' WHERE tabname = :table AND name = :column_name '
    )
    stattype_sql = (
        ' SELECT COUNT(*) FROM bayesdb_stattype'
        ' WHERE name = :stattype '
    )
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            # No row came back: the table has no such column.
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        # 'ignore' is accepted here even when it is not a registered
        # statistical type.
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        variable_map[name] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r'
            % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(
            bdb, 'Duplicate column names: %r' % (list(duplicates), ))
    if invalid:
        raise BQLError(
            bdb, 'Invalid statistical types: %r' % (list(invalid), ))

    # Insert variable records; ignored columns get no record.
    variable_sql = (
        ' INSERT INTO bayesdb_variable'
        ' (population_id, name, colno, stattype)'
        ' VALUES (?, ?, ?, ?) '
    )
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        bdb.sql_execute(
            variable_sql, (population_id, name, colno, stattype))
def _is_categorical(stattype):
    """Tell whether `stattype` is 'categorical' or 'nominal', ignoring case."""
    folded_stattype = casefold(stattype)
    return folded_stattype == 'categorical' or folded_stattype == 'nominal'
def bayesdb_rowid_tokens(bdb):
    """Return the list of tokens stored in the bayesdb_rowid_tokens table."""
    cursor = bdb.sql_execute(' SELECT token FROM bayesdb_rowid_tokens ')
    return [row[0] for row in cursor.fetchall()]


def bayesdb_has_stattype(bdb, stattype):
    """Return True if bayesdb_stattype has a row named `stattype`.

    The name is casefolded before it is bound into the query.
    """
    count_cursor = bdb.sql_execute(
        'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype',
        {'stattype': casefold(stattype)})
    matches = cursor_value(count_cursor)
    return matches > 0


# XXX This should be stored in the database by adding a column to the
# bayesdb_stattype table -- when we are later willing to contemplate
# adding statistical types, e.g. COUNT, SCALE, or NONNEGATIVE REAL.
#
# Maps each casefolded statistical type name to its casefolded affinity.
_STATTYPE_TO_AFFINITY = dict(
    (casefold(stattype_name), casefold(affinity_name))
    for stattype_name, affinity_name in [
        ('categorical', 'text'),
        ('cyclic', 'real'),
        ('numerical', 'real'),
        ('counts', 'real'),
        ('magnitude', 'real'),
        ('nominal', 'text'),
        ('numericalranged', 'real'),
    ]
)


def bayesdb_stattype_affinity(_bdb, stattype):
    """Return the affinity ('text' or 'real') of the type `stattype`.

    Asserts that `stattype` is registered in the database, then looks
    up its casefolded name in _STATTYPE_TO_AFFINITY.
    """
    assert bayesdb_has_stattype(_bdb, stattype)
    folded_stattype = casefold(stattype)
    return _STATTYPE_TO_AFFINITY[folded_stattype]