def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    """Create a crosscat generator `generator` for `table`.

    `column_stattypes` maps casefolded column names to statistical
    types; every name it mentions must be a column of `table`, and the
    generator schema assigns each table column its requested stattype.
    """
    table_columns = core.bayesdb_table_column_names(bdb, table)
    quoted_columns = map(sqlite3_quote_name, table_columns)
    # Internal sanity check: callers must only request known stattypes.
    assert all(column_stattypes[name] in allowed_column_stattypes
        for name in column_stattypes)
    known_names = set(casefold(name) for name in table_columns)
    for requested in column_stattypes:
        if requested not in known_names:
            raise IOError('No such column in table %s: %s' %
                (repr(table), repr(requested)))
    # Pair each quoted column with its requested statistical type.
    schema_parts = ['%s %s' % (qc, column_stattypes[casefold(name)])
        for name, qc in zip(table_columns, quoted_columns)]
    schema = ','.join(schema_parts)
    qg = sqlite3_quote_name(generator)
    qt = sqlite3_quote_name(table)
    qmm = 'crosscat'
    bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' %
        (qg, qt, qmm, schema))
def subsample_table_columns(bdb, table, new_table, limit, keep, drop, seed):
    """Return a subsample of the columns in the table."""
    if not bayesdb_has_table(bdb, table):
        raise ValueError('No such table: %s' % (table, ))
    if bayesdb_has_table(bdb, new_table):
        raise ValueError('Table already exists: %s' % (new_table, ))
    # Normalize the requested column names for case-insensitive matching.
    keep = [casefold(column) for column in keep]
    drop = [casefold(column) for column in drop]
    skip = keep + drop
    # Every explicitly named column must exist in the source table.
    unknown = [column for column in skip
        if not bayesdb_table_has_column(bdb, table, column)]
    if unknown:
        raise ValueError('No such columns: %s' % (unknown, ))
    # A column cannot appear in both the keep and drop lists.
    overlap = [column for column in drop if column in keep]
    if overlap:
        raise ValueError('Cannot both drop and keep columns: %s' %
            (overlap, ))
    num_sample = limit - len(keep)
    if num_sample < 0:
        raise ValueError('Must sample at least as many columns to keep.')
    # Candidates for random selection: everything not kept or dropped.
    candidates = [column for column in bayesdb_table_column_names(bdb, table)
        if casefold(column) not in skip]
    rng = np.random.RandomState(seed)
    sampled = rng.choice(candidates, replace=False,
        size=min(len(candidates), num_sample))
    qt = bql_quote_name(table)
    qnt = bql_quote_name(new_table)
    qc = ','.join(map(bql_quote_name, itertools.chain(keep, sampled)))
    cursor = bdb.execute('''
        CREATE TABLE %s AS SELECT %s FROM %s
    ''' % (qnt, qc, qt))
    return cursor_to_df(cursor)
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    """
    # Creating a table requires a header to name its columns, and
    # `ifnotexists` is meaningful only together with `create`.
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        line = 1
        if header:
            row = None
            try:
                row = reader.next()
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Reject blank column names up front: they would otherwise
            # create or match a column with an empty name.
            if any(len(c) == 0 for c in column_names):
                raise IOError('Missing column names in header: %s' %
                    repr(column_names))
            # Detect case-insensitive duplicates among the header names.
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)), ))
            if create and not core.bayesdb_has_table(bdb, table):
                # Create the table with one NUMERIC column per header name.
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                # Table already exists: every header name must match a
                # column of the existing table.
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
            # Fix: advance the line counter so errors report the actual
            # CSV line; previously it was never incremented past the header.
            line += 1
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    """
    # Creating a table requires a header to name its columns, and
    # `ifnotexists` is meaningful only together with `create`.
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        line = 1
        if header:
            row = None
            try:
                row = reader.next()
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Blank column names would create or match a column with an
            # empty name, so reject them up front.
            if any(len(c)==0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s'
                    %repr(column_names))
            # Detect case-insensitive duplicates among the header names.
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                # Create the table with one NUMERIC column per header name.
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                # Table already exists: every header name must match a
                # column of the existing table.
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names if not
                    core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
            # Fix: advance the line counter so errors report the actual
            # CSV line; previously it was never incremented past the header.
            line += 1
def _create_population(bdb, phrase):
    """Create the population named by `phrase` over `phrase.table`.

    Every column of the base table must be covered by a MODEL, IGNORE,
    or GUESS clause of the population schema; guessed columns whose
    stattype comes back as `key` are demoted to ignored.

    :raises BQLError: on an existing population (without IF NOT EXISTS),
        unmodeled/duplicate/missing columns, or invalid stattypes.
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(
                bdb, 'Name already defined as population: %r' %
                (phrase.name, ))
    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)
    # Retrieve all columns from the base table. The user is required to
    # provide a strategy for each single variable, either MODEL, IGNORE,
    # or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)
    # (Removed dead local `seen_columns`, which was never used.)
    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    ''', (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)
    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
            for s in phrase.schema if isinstance(s, ast.PopModelVars)]))
    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
            for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))
    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, pop_guess_stattypes)
        migrate = [(col, st) for col, st in pop_guess_vars if st == 'key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []
    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars
    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))
    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        variable_map[name] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' %
            (list(duplicates), ))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' %
            (list(invalid), ))
    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        bdb.sql_execute('''
            INSERT INTO bayesdb_variable
                (population_id, name, colno, stattype)
                VALUES (?, ?, ?, ?)
        ''', (population_id, name, colno, stattype))
def _create_population(bdb, phrase):
    """Create the (possibly implicit) population named by `phrase`.

    When `phrase.name` is None the population is implicit and takes the
    base table's name.  Every column of the base table must be covered
    by a MODEL, IGNORE, or GUESS clause; guessed columns whose stattype
    comes back as `key` are demoted to ignored.

    :raises BQLError: on an existing population (without IF NOT EXISTS),
        string columns modeled as numerical, unmodeled/duplicate/missing
        columns, or invalid stattypes.
    """
    # Retrieve the (possibility implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0
    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(bdb, 'Name already defined as population: %r' %
                (population_name,))
    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)
    # Retrieve all columns from the base table. The user is required to
    # provide a strategy for each single variable, either MODEL, IGNORE,
    # or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)
    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname, implicit)
            VALUES (?, ?, ?)
    ''', (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)
    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
            for s in phrase.schema if isinstance(s, ast.PopModelVars)]))
    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
            for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))
    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, [st[0] for st in pop_guess_stattypes])
        migrate = [(col, st) for col, st in pop_guess_vars if st=='key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []
    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
            and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r' %
            (numerical_string_vars, ))
    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars
    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))
    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        # Fix: record the casefolded name, since the duplicate check
        # above tests the casefolded `name`.  Previously the raw `nm`
        # was added, so duplicates differing in case slipped through.
        seen_variables.add(name)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' %
            (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' %
            (list(invalid),))
    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)