Exemplo n.º 1
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Populate the NIG-Normal tables for a newly created generator.

        Every variable number listed for the generator's population must
        have stattype 'numerical'; the triple returned by
        `data_suff_stats` for it is stored as (count, sum, sumsq) in
        bayesdb_nig_normal_column.  Each schema clause must then have the
        shape [name, 'deviation', [observed_name]]: `name` is added as a
        latent 'numerical' variable and linked to the observed variable
        in bayesdb_nig_normal_deviation.  A schema equal to [[]] declares
        no deviation variables.

        Raises BQLError for a non-numerical column, a malformed clause,
        or a clause whose observed variable does not exist.
        """
        # XXX Do something with the schema.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
            column_name = core.bayesdb_variable_name(
                bdb, population_id, generator_id, colno)
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno)
            if stattype != 'numerical':
                message = ('NIG-Normal only supports'
                           ' numerical columns, but %s is %s' %
                           (repr(column_name), repr(stattype)))
                raise BQLError(bdb, message)
            (count, xsum, sumsq) = data_suff_stats(bdb, table, column_name)
            column_row = {
                'population_id': population_id,
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            }
            bdb.sql_execute(insert_column_sql, column_row)

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        for clause in schema:
            # The checks are ordered so that indexing only happens after
            # the corresponding length/type check has passed.
            well_formed = (
                len(clause) == 3
                and isinstance(clause[0], str)
                and clause[1] == 'deviation'
                and isinstance(clause[2], list)
                and len(clause[2]) == 1
                and isinstance(clause[2][0], str))
            if not well_formed:
                raise BQLError(bdb,
                               'Invalid nig_normal clause: %r' % (clause, ))
            dev_var = clause[0]
            obs_var = clause[2][0]
            obs_known = core.bayesdb_has_variable(
                bdb, population_id, None, obs_var)
            if not obs_known:
                raise BQLError(bdb, 'No such variable: %r' % (obs_var, ))
            obs_colno = core.bayesdb_variable_number(
                bdb, population_id, None, obs_var)
            dev_colno = core.bayesdb_add_latent(
                bdb, population_id, generator_id, dev_var, 'numerical')
            insert_deviation_sql = '''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            '''
            bdb.sql_execute(
                insert_deviation_sql,
                (population_id, generator_id, dev_colno, obs_colno))
Exemplo n.º 2
0
    def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1,
            max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
            program=None):
        """Refresh mu and sigma for the selected models of a generator.

        When `modelnos` is None, the model numbers come from
        `self._modelnos`.  The work itself is delegated to
        `self._set_models` with an UPDATE statement on
        bayesdb_nig_normal_model.  `iterations`, `max_seconds`,
        `ckpt_iterations` and `ckpt_seconds` are accepted but not used.

        Raises NotImplementedError if `program` is not None.
        """
        if program is not None:
            # XXX
            raise NotImplementedError('nig_normal analysis programs')

        population_id = core.bayesdb_generator_population(bdb, generator_id)
        # Analysis timing control is ignored because one step reaches the
        # posterior anyway.
        # NOTE: The model iteration count is not updated.  Since
        # inference converges in one step, the only metadata lost is
        # whether that one step was done or not.
        update_sample_sql = '''
            UPDATE bayesdb_nig_normal_model SET mu = :mu, sigma = :sigma
                WHERE
                    population_id = :population_id
                    AND generator_id = :generator_id
                    AND colno = :colno
                    AND modelno = :modelno
        '''
        # Defaulting to every model assumes that models x columns forms a
        # dense rectangle in the database, which it should.
        selected_modelnos = (
            self._modelnos(bdb, generator_id)
            if modelnos is None else modelnos)
        self._set_models(
            bdb, population_id, generator_id, selected_modelnos,
            update_sample_sql)
Exemplo n.º 3
0
 def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
                               colnos1, constraints, numsamples):
     """Estimate mutual information between two sets of columns.

     The column numbers in `colnos0` and `colnos1` are translated to
     variable names and encoded through the preql server into masks.
     One estimate is requested per conditioning row: the rows from
     `self._get_constraint_rows` when `self._marginize_cmi(constraints)`
     is true, otherwise the single row from `self._get_constraint_row`.

     Returns a one-element list holding the arithmetic mean of the
     per-row `.mean` estimates.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)

     def _names_of(colnos):
         # Variable names as `str`, in the order the numbers were given.
         return [
             str(bayesdb_variable_name(bdb, population_id, generator_id, c))
             for c in colnos
         ]

     colnames0 = _names_of(colnos0)
     colnames1 = _names_of(colnos1)
     server = self._get_preql_server(bdb, generator_id)
     target_set = server._cols_to_mask(server.encode_set(colnames0))
     query_set = server._cols_to_mask(server.encode_set(colnames1))

     if self._marginize_cmi(constraints):
         inner_numsamples = numsamples
         conditioning_rows = self._get_constraint_rows(
             constraints, bdb, generator_id, population_id, modelnos,
             server, inner_numsamples)
     else:
         single_row = self._get_constraint_row(
             constraints, bdb, generator_id, population_id, server)
         conditioning_rows = [single_row]

     mi_estimates = []
     for conditioning_row in conditioning_rows:
         estimate = server._query_server.mutual_information(
             target_set,
             query_set,
             entropys=None,
             sample_count=loom.preql.SAMPLE_COUNT,
             conditioning_row=conditioning_row)
         mi_estimates.append(estimate.mean)
     # Output requires an iterable.
     return [arithmetic_mean(mi_estimates)]
Exemplo n.º 4
0
 def _get_ordered_column_names(self, bdb, generator_id):
     """Return the variable names in the order of their loom rank.

     The order is whatever `self._get_ordered_column_numbers` yields;
     each column number is mapped to its name in the generator's
     population.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     ordered_names = []
     for colno in self._get_ordered_column_numbers(bdb, generator_id):
         ordered_names.append(
             bayesdb_variable_name(bdb, population_id, None, colno))
     return ordered_names
Exemplo n.º 5
0
    def logpdf_joint(self, bdb, generator_id, modelnos, rowid, targets,
            constraints):
        """Score `targets` given `constraints` with the query server.

        `targets` and `constraints` are iterables of (colno, value)
        pairs.  Two rows keyed by the ordered column names are built:
        one holding targets and constraints, one holding only the
        constraints.  The result is the score of the first minus the
        score of the second.  `modelnos` and `rowid` are not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        ordered_column_names = self._get_ordered_column_names(bdb, generator_id)

        # Pr[targets|constraints] = Pr[targets, constraints] / Pr[constraints]
        # joint_row is the numerator, given_row the denominator.
        joint_row = OrderedDict(
            (name, None) for name in ordered_column_names)
        given_row = OrderedDict(
            (name, None) for name in ordered_column_names)

        for (colno, value) in targets:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            joint_row[name] = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            given_row[name] = None

        for (colno, value) in constraints:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            converted = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            joint_row[name] = converted
            given_row[name] = converted

        joint_values = joint_row.values()
        given_values = given_row.values()

        server = self._get_query_server(bdb, generator_id)
        joint_score = server.score(joint_values)
        given_score = server.score(given_values)
        return joint_score - given_score
Exemplo n.º 6
0
    def predict_confidence(self, bdb, generator_id, modelnos, rowid, colno,
            numsamples=None):
        """Impute a value for `colno` in row `rowid`, with a confidence.

        Draws `numsamples` samples (2 when `numsamples` is falsy) of the
        column through `self.simulate_joint` and summarizes them:

        - if `_is_nominal` accepts the column's stattype, the prediction
          is a most frequent sampled value and the confidence is its
          count divided by `numsamples`;
        - otherwise the prediction is the arithmetic mean of the samples
          and the confidence is 0.

        Returns a (prediction, confidence) pair.
        """
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            # Use the built-in next() rather than the iterator's .next()
            # method: the method only exists on Python 2 iterators, the
            # built-in works on Python 2.6+ and Python 3 alike.
            pred = next(v for v in counts if counts[v] == mode_count)
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0
            return pred, conf

        # Retrieve the samples: specifying rowid suffices to ensure that
        # relevant constraints are retrieved by simulate_joint, so the
        # explicit constraints list is empty.
        sample = self.simulate_joint(
            bdb, generator_id, modelnos, rowid, [colno], [], numsamples)

        # Determine the imputation strategy (mode or mean).
        population_id = bayesdb_generator_population(bdb, generator_id)
        stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)

        # Run the imputation.
        if _is_nominal(stattype):
            return _impute_categorical(sample)
        else:
            return _impute_numerical(sample)
Exemplo n.º 7
0
    def logpdf_joint(self, bdb, generator_id, modelnos, rowid, targets,
                     constraints):
        """Score `targets` given `constraints` with the query server.

        `targets` and `constraints` are iterables of (colno, value)
        pairs.  Two rows keyed by the ordered column names are built:
        one holding targets and constraints, one holding only the
        constraints.  The result is the score of the first minus the
        score of the second.  `modelnos` and `rowid` are not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        ordered_column_names = self._get_ordered_column_names(
            bdb, generator_id)

        # Pr[targets|constraints] = Pr[targets, constraints] / Pr[constraints]
        # joint_row is the numerator, given_row the denominator.
        joint_row = OrderedDict(
            (name, None) for name in ordered_column_names)
        given_row = OrderedDict(
            (name, None) for name in ordered_column_names)

        for (colno, value) in targets:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            joint_row[name] = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            given_row[name] = None

        for (colno, value) in constraints:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            converted = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            joint_row[name] = converted
            given_row[name] = converted

        joint_values = joint_row.values()
        given_values = given_row.values()

        server = self._get_query_server(bdb, generator_id)
        joint_score = server.score(joint_values)
        given_score = server.score(given_values)
        return joint_score - given_score
Exemplo n.º 8
0
 def _get_ordered_column_names(self, bdb, generator_id):
     """Return the variable names in the order of their loom rank.

     The order is whatever `self._get_ordered_column_numbers` yields;
     each column number is mapped to its name in the generator's
     population.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     ordered_names = []
     for colno in self._get_ordered_column_numbers(bdb, generator_id):
         ordered_names.append(
             bayesdb_variable_name(bdb, population_id, None, colno))
     return ordered_names
Exemplo n.º 9
0
    def analyze_models(self,
                       bdb,
                       generator_id,
                       modelnos=None,
                       iterations=1,
                       max_seconds=None,
                       ckpt_iterations=None,
                       ckpt_seconds=None,
                       program=None):
        """Refresh mu and sigma for the selected models of a generator.

        When `modelnos` is None, the model numbers come from
        `self._modelnos`.  The work itself is delegated to
        `self._set_models` with an UPDATE statement on
        bayesdb_nig_normal_model.  `iterations`, `max_seconds`,
        `ckpt_iterations` and `ckpt_seconds` are accepted but not used.

        Raises NotImplementedError if `program` is not None.
        """
        if program is not None:
            # XXX
            raise NotImplementedError('nig_normal analysis programs')

        population_id = core.bayesdb_generator_population(bdb, generator_id)
        # Analysis timing control is ignored because one step reaches the
        # posterior anyway.
        # NOTE: The model iteration count is not updated.  Since
        # inference converges in one step, the only metadata lost is
        # whether that one step was done or not.
        update_sample_sql = '''
            UPDATE bayesdb_nig_normal_model SET mu = :mu, sigma = :sigma
                WHERE
                    population_id = :population_id
                    AND generator_id = :generator_id
                    AND colno = :colno
                    AND modelno = :modelno
        '''
        # Defaulting to every model assumes that models x columns forms a
        # dense rectangle in the database, which it should.
        selected_modelnos = (
            self._modelnos(bdb, generator_id)
            if modelnos is None else modelnos)
        self._set_models(
            bdb, population_id, generator_id, selected_modelnos,
            update_sample_sql)
Exemplo n.º 10
0
    def predict_confidence(self,
                           bdb,
                           generator_id,
                           modelnos,
                           rowid,
                           colno,
                           numsamples=None):
        """Impute a value for `colno` in row `rowid`, with a confidence.

        Draws `numsamples` samples (2 when `numsamples` is falsy) of the
        column through `self.simulate_joint` and summarizes them:

        - if `_is_nominal` accepts the column's stattype, the prediction
          is a most frequent sampled value and the confidence is its
          count divided by `numsamples`;
        - otherwise the prediction is the arithmetic mean of the samples
          and the confidence is 0.

        Returns a (prediction, confidence) pair.
        """
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            # Use the built-in next() rather than the iterator's .next()
            # method: the method only exists on Python 2 iterators, the
            # built-in works on Python 2.6+ and Python 3 alike.
            pred = next(v for v in counts if counts[v] == mode_count)
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0
            return pred, conf

        # Retrieve the samples. Specifying `rowid` ensures that relevant
        # constraints are retrieved by `simulate_joint`, so provide empty
        # constraints.
        sample = self.simulate_joint(bdb, generator_id, modelnos, rowid,
                                     [colno], [], numsamples)

        # Determine the imputation strategy (mode or mean).
        population_id = bayesdb_generator_population(bdb, generator_id)
        stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
        if _is_nominal(stattype):
            return _impute_categorical(sample)
        return _impute_numerical(sample)
Exemplo n.º 11
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Populate the NIG-Normal tables for a newly created generator.

        Every variable number listed for the generator's population must
        have stattype 'numerical'; the triple returned by
        `data_suff_stats` for it is stored as (count, sum, sumsq) in
        bayesdb_nig_normal_column.  Each schema clause must then have the
        shape [name, 'deviation', [observed_name]]: `name` is added as a
        latent 'numerical' variable and linked to the observed variable
        in bayesdb_nig_normal_deviation.  A schema equal to [[]] declares
        no deviation variables.

        Raises BQLError for a non-numerical column, a malformed clause,
        or a clause whose observed variable does not exist.
        """
        # XXX Do something with the schema.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
            column_name = core.bayesdb_variable_name(
                bdb, population_id, generator_id, colno)
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno)
            if stattype != 'numerical':
                message = ('NIG-Normal only supports'
                           ' numerical columns, but %s is %s'
                           % (repr(column_name), repr(stattype)))
                raise BQLError(bdb, message)
            (count, xsum, sumsq) = data_suff_stats(bdb, table, column_name)
            column_row = {
                'population_id': population_id,
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            }
            bdb.sql_execute(insert_column_sql, column_row)

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        for clause in schema:
            # The checks are ordered so that indexing only happens after
            # the corresponding length/type check has passed.
            well_formed = (
                len(clause) == 3
                and isinstance(clause[0], str)
                and clause[1] == 'deviation'
                and isinstance(clause[2], list)
                and len(clause[2]) == 1
                and isinstance(clause[2][0], str))
            if not well_formed:
                raise BQLError(bdb, 'Invalid nig_normal clause: %r' %
                    (clause,))
            dev_var = clause[0]
            obs_var = clause[2][0]
            obs_known = core.bayesdb_has_variable(
                bdb, population_id, None, obs_var)
            if not obs_known:
                raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
            obs_colno = core.bayesdb_variable_number(
                bdb, population_id, None, obs_var)
            dev_colno = core.bayesdb_add_latent(
                bdb, population_id, generator_id, dev_var, 'numerical')
            insert_deviation_sql = '''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            '''
            bdb.sql_execute(
                insert_deviation_sql,
                (population_id, generator_id, dev_colno, obs_colno))
Exemplo n.º 12
0
 def initialize_models(self, bdb, generator_id, modelnos):
     """Create rows in bayesdb_nig_normal_model for `modelnos`.

     Delegates to `self._set_models`, handing it the INSERT statement
     to run with the population id, generator id and model numbers.
     """
     insert_sample_sql = '''
         INSERT INTO bayesdb_nig_normal_model
             (population_id, generator_id, colno, modelno, mu, sigma)
             VALUES (:population_id, :generator_id, :colno, :modelno,
                 :mu, :sigma)
     '''
     self._set_models(
         bdb,
         core.bayesdb_generator_population(bdb, generator_id),
         generator_id,
         modelnos,
         insert_sample_sql)
Exemplo n.º 13
0
 def initialize_models(self, bdb, generator_id, modelnos):
     """Create rows in bayesdb_nig_normal_model for `modelnos`.

     Delegates to `self._set_models`, handing it the INSERT statement
     to run with the population id, generator id and model numbers.
     """
     insert_sample_sql = '''
         INSERT INTO bayesdb_nig_normal_model
             (population_id, generator_id, colno, modelno, mu, sigma)
             VALUES (:population_id, :generator_id, :colno, :modelno,
                 :mu, :sigma)
     '''
     self._set_models(
         bdb,
         core.bayesdb_generator_population(bdb, generator_id),
         generator_id,
         modelnos,
         insert_sample_sql)
Exemplo n.º 14
0
 def _convert_to_proper_stattype(self, bdb, generator_id, colno, value):
     """Convert a value returned by the logpdf_joint method parameters into a
     form that Loom can handle. For instance, convert from an integer to
     real or, from a string to an integer.
     """
     if value is None:
         return value
     population_id = bayesdb_generator_population(bdb, generator_id)
     stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
     # If nominal then return the integer code.
     if _is_nominal(stattype):
         return self._get_integer_form(bdb, generator_id, colno, value)
     # Return the value as float.
     return float(value)
Exemplo n.º 15
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Register a loom generator and ingest the population's data.

        Steps, in order: record the generator's name and
        `self.loom_store_path` in bayesdb_loom_generator; read every
        population variable's column from the base table; hand the data
        to `loom.tasks.ingest` as a CSV file plus schema file; store the
        encoding info; and record, for each table row, its `oid` together
        with its position in the `SELECT oid` result as the loom rowid.
        `schema` and `kwargs` are not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store generator info in bdb.
        generator_name = self._generate_name(bdb, generator_id)
        insert_generator_sql = '''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        '''
        bdb.sql_execute(
            insert_generator_sql,
            (generator_id, generator_name, self.loom_store_path))

        # Read the table one column at a time.
        headers = []
        columns = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            column_name = bayesdb_variable_name(bdb, population_id, None,
                                                colno)
            headers.append(column_name)
            qt = sqlite3_quote_name(table)
            qcn = sqlite3_quote_name(column_name)
            select_sql = 'SELECT %s FROM %s' % (qcn, qt)
            fetched = bdb.sql_execute(select_sql).fetchall()
            col_data = [item for (item, ) in fetched]
            columns.append(col_data)
            data_by_column[column_name] = col_data
        # Transpose the per-column lists into per-row lists.
        rows = [list(row) for row in zip(*columns)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, rows)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(project_path,
                          rows_csv=csv_file.name,
                          schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store rowid mapping in the bdb.
        qt = sqlite3_quote_name(table)
        rowids = bdb.sql_execute('SELECT oid FROM %s' % (qt, )).fetchall()
        mapping_values = []
        for loom_rowid, (table_rowid, ) in enumerate(rowids):
            mapping_values.append(
                str((generator_id, table_rowid, loom_rowid)))
        insertions = ','.join(mapping_values)
        # NOTE(review): with no rows `insertions` is empty, so the
        # statement below ends in a bare VALUES -- confirm that an empty
        # table cannot reach this point.
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (insertions, ))
Exemplo n.º 16
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Register a loom generator and ingest the population's data.

        Steps, in order: record the generator's name and
        `self.loom_store_path` in bayesdb_loom_generator; read every
        population variable's column from the base table; hand the data
        to `loom.tasks.ingest` as a CSV file plus schema file; store the
        encoding info; and record, for each table row, its `oid` together
        with its position in the `SELECT oid` result as the loom rowid.
        `schema` and `kwargs` are not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store generator info in bdb.
        generator_name = self._generate_name(bdb, generator_id)
        insert_generator_sql = '''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        '''
        bdb.sql_execute(
            insert_generator_sql,
            (generator_id, generator_name, self.loom_store_path))

        # Read the table one column at a time.
        headers = []
        columns = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            column_name = bayesdb_variable_name(bdb, population_id, None, colno)
            headers.append(column_name)
            qt = sqlite3_quote_name(table)
            qcn = sqlite3_quote_name(column_name)
            select_sql = 'SELECT %s FROM %s' % (qcn, qt)
            fetched = bdb.sql_execute(select_sql).fetchall()
            col_data = [item for (item,) in fetched]
            columns.append(col_data)
            data_by_column[column_name] = col_data
        # Transpose the per-column lists into per-row lists.
        rows = [list(row) for row in zip(*columns)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, rows)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(project_path, rows_csv=csv_file.name,
            schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store rowid mapping in the bdb.
        qt = sqlite3_quote_name(table)
        rowids = bdb.sql_execute('SELECT oid FROM %s' % (qt,)).fetchall()
        mapping_values = []
        for loom_rowid, (table_rowid,) in enumerate(rowids):
            mapping_values.append(
                str((generator_id, table_rowid, loom_rowid)))
        insertions = ','.join(mapping_values)
        # NOTE(review): with no rows `insertions` is empty, so the
        # statement below ends in a bare VALUES -- confirm that an empty
        # table cannot reach this point.
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (insertions,))
Exemplo n.º 17
0
    def _data(self, bdb, generator_id, vars):
        """Fetch the generator's rows for `vars` as tuples of numeric codes.

        Each variable is selected from the generator's table, cast to the
        affinity of its statistical type; variables with a negative
        column number are selected as NULL.  Only rows listed in
        bayesdb_cgpm_individual for this generator are returned, ordered
        by ascending table rowid.  Every fetched value is passed through
        `self._to_numeric`.

        Returns a list with one tuple per row, in `vars` order.
        """
        # Get the column numbers and statistical types.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        colnos = []
        for var in vars:
            colnos.append(core.bayesdb_variable_number(
                bdb, population_id, generator_id, var))
        # NOTE(review): called without a generator_id argument, unlike
        # bayesdb_variable_number above -- confirm against core's
        # signature.
        stattypes = []
        for colno in colnos:
            stattypes.append(
                core.bayesdb_variable_stattype(bdb, population_id, colno))

        # Get the table name, quoted for constructing SQL.
        table_name = core.bayesdb_generator_table(bdb, generator_id)
        qt = sqlite3_quote_name(table_name)

        def _select_expression(var, colno, stattype):
            # SQL expression casting the variable to the affinity of its
            # statistical type.
            if colno >= 0:
                quoted_var = sqlite3_quote_name(var)
                affinity = core.bayesdb_stattype_affinity(bdb, stattype)
                quoted_affinity = sqlite3_quote_name(affinity)
                return 'CAST(t.%s AS %s)' % (quoted_var, quoted_affinity)
            return 'NULL'

        qexpressions = ','.join(
            map(_select_expression, vars, colnos, stattypes))

        # Get a cursor.
        select_sql = '''
            SELECT %s FROM %s AS t, bayesdb_cgpm_individual AS ci
                WHERE ci.generator_id = ?
                    AND ci.table_rowid = t._rowid_
            ORDER BY t._rowid_ ASC
        ''' % (qexpressions, qt)
        cursor = bdb.sql_execute(select_sql, (generator_id, ))

        # Map values to codes, one output tuple per fetched row.
        coded_rows = []
        for row in cursor:
            coded_rows.append(tuple(
                self._to_numeric(bdb, generator_id, colno, x)
                for colno, x in zip(colnos, row)))
        return coded_rows
Exemplo n.º 18
0
    def _store_encoding_info(self, bdb, generator_id):
        """Copy loom's ingest encoding into the bdb.

        Reads `ingest/encoding.json.gz` under the generator's loom
        project path.  For each encoded column that has a 'symbols'
        entry, every (string_form, integer_form) pair is inserted into
        bayesdb_loom_string_encoding.  Then each column's position in the
        encoding is inserted as its rank into
        bayesdb_loom_column_ordering.
        """
        project_path = self._get_loom_project_path(bdb, generator_id)
        encoding_path = os.path.join(
            project_path, 'ingest', 'encoding.json.gz')
        with gzip.open(encoding_path) as encoding_file:
            raw_encoding = encoding_file.read()
        encoding = json.loads(raw_encoding.decode('ascii'))

        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store string encoding.
        insert_string_encoding = '''
            INSERT INTO bayesdb_loom_string_encoding
            (generator_id, colno, string_form, integer_form)
            VALUES (:generator_id, :colno, :string_form, :integer_form)
        '''
        for col in encoding:
            if 'symbols' not in col:
                continue
            colno = bayesdb_table_column_number(bdb, table,
                                                str(col['name']))
            for string_form, integer_form in col['symbols'].iteritems():
                bindings = {
                    'generator_id': generator_id,
                    'colno': colno,
                    'string_form': string_form,
                    'integer_form': integer_form
                }
                bdb.sql_execute(insert_string_encoding, bindings)

        # Store ordering of columns.
        insert_order_sql = '''
            INSERT INTO bayesdb_loom_column_ordering
            (generator_id, colno, rank)
            VALUES (:generator_id, :colno, :rank)
        '''
        for rank, col in enumerate(encoding):
            colno = bayesdb_table_column_number(
                bdb, table, str(col['name']))
            bindings = {
                'generator_id': generator_id,
                'colno': colno,
                'rank': rank
            }
            bdb.sql_execute(insert_order_sql, bindings)
Exemplo n.º 19
0
    def _store_encoding_info(self, bdb, generator_id):
        """Copy loom's ingest encoding into the bdb.

        Reads `ingest/encoding.json.gz` under the generator's loom
        project path.  For each encoded column that has a 'symbols'
        entry, every (string_form, integer_form) pair is inserted into
        bayesdb_loom_string_encoding.  Then each column's position in the
        encoding is inserted as its rank into
        bayesdb_loom_column_ordering.
        """
        project_path = self._get_loom_project_path(bdb, generator_id)
        encoding_path = os.path.join(
            project_path, 'ingest', 'encoding.json.gz')
        with gzip.open(encoding_path) as encoding_file:
            raw_encoding = encoding_file.read()
        encoding = json.loads(raw_encoding.decode('ascii'))

        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store string encoding.
        insert_string_encoding = '''
            INSERT INTO bayesdb_loom_string_encoding
            (generator_id, colno, string_form, integer_form)
            VALUES (:generator_id, :colno, :string_form, :integer_form)
        '''
        for col in encoding:
            if 'symbols' not in col:
                continue
            colno = bayesdb_table_column_number(bdb, table, str(col['name']))
            for string_form, integer_form in col['symbols'].iteritems():
                bindings = {
                    'generator_id': generator_id,
                    'colno': colno,
                    'string_form': string_form,
                    'integer_form': integer_form
                }
                bdb.sql_execute(insert_string_encoding, bindings)

        # Store ordering of columns.
        insert_order_sql = '''
            INSERT INTO bayesdb_loom_column_ordering
            (generator_id, colno, rank)
            VALUES (:generator_id, :colno, :rank)
        '''
        for rank, col in enumerate(encoding):
            colno = bayesdb_table_column_number(
                bdb, table, str(col['name']))
            bindings = {
                'generator_id': generator_id,
                'colno': colno,
                'rank': rank
            }
            bdb.sql_execute(insert_order_sql, bindings)
Exemplo n.º 20
0
 def _store_kind_partition(self, bdb, generator_id, modelnos):
     """Store the column and row partitions of the given models.

     For each model number (all models of the generator when `modelnos`
     is None), inside a single savepoint:

     - every population variable is mapped to the kind found at its
       loom rank in the model's column partition, and written to
       bayesdb_loom_column_kind_partition;
     - for every kind in the model's row partition, the generator's
       (table_rowid, loom_rowid) pairs are zipped positionally with
       that kind's partition ids and written to
       bayesdb_loom_row_kind_partition.

     Existing rows are overwritten (INSERT OR REPLACE).
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     if modelnos is None:
         modelnos = range(self._get_num_models(bdb, generator_id))
     with bdb.savepoint():
         # The variable numbers, their loom ranks and the rowid mapping
         # do not depend on the model, so look them up once.
         colnos = bayesdb_variable_numbers(bdb, population_id, None)
         ranks = [
             self._get_loom_rank(bdb, generator_id, colno)
             for colno in colnos
         ]
         # Restrict the mapping to this generator: the table holds rows
         # for every loom generator, and rows of another generator must
         # not be paired with this generator's partition ids.
         # NOTE(review): the pairing below is positional and there is
         # no ORDER BY -- presumably the partition lists follow
         # loom_rowid order; confirm.
         rowids = bdb.sql_execute('''
             SELECT table_rowid, loom_rowid
                 FROM bayesdb_loom_rowid_mapping
                 WHERE generator_id = ?
         ''', (generator_id,)).fetchall()
         for modelno in modelnos:
             column_partition = self._retrieve_column_partition(
                 bdb, generator_id, modelno)
             # Bulk insertion of mapping from colno to kind_id.
             insertions = ','.join(
                 str((generator_id, modelno, colno, column_partition[rank]))
                 for colno, rank in zip(colnos, ranks)
             )
             # An empty VALUES list is not valid SQL; nothing to store.
             if insertions:
                 bdb.sql_execute('''
                 INSERT OR REPLACE INTO bayesdb_loom_column_kind_partition
                 (generator_id, modelno, colno, kind_id)
                 VALUES %s
             ''' % (insertions,))
             # Bulk insertion of mapping from (kind_id, rowid) to cluster_id.
             row_partition = self._retrieve_row_partition(
                 bdb, generator_id, modelno)
             insertions = ','.join(
                 str((generator_id, modelno, rowid[0], rowid[1],
                         kind_id, partition_id))
                 for kind_id in row_partition
                 for rowid, partition_id
                     in zip(rowids, row_partition[kind_id]))
             if insertions:
                 bdb.sql_execute('''
                 INSERT OR REPLACE INTO
                     bayesdb_loom_row_kind_partition
                 (generator_id, modelno, table_rowid, loom_rowid,
                     kind_id, partition_id)
                 VALUES %s
             ''' % (insertions,))
Exemplo n.º 21
0
 def _store_kind_partition(self, bdb, generator_id, modelnos):
     """Store the column and row partitions of the given models.

     For each model number (all models of the generator when `modelnos`
     is None), inside a single savepoint:

     - every population variable is mapped to the kind found at its
       loom rank in the model's column partition, and written to
       bayesdb_loom_column_kind_partition;
     - for every kind in the model's row partition, the generator's
       (table_rowid, loom_rowid) pairs are zipped positionally with
       that kind's partition ids and written to
       bayesdb_loom_row_kind_partition.

     Existing rows are overwritten (INSERT OR REPLACE).
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     if modelnos is None:
         modelnos = range(self._get_num_models(bdb, generator_id))
     with bdb.savepoint():
         # The variable numbers, their loom ranks and the rowid mapping
         # do not depend on the model, so look them up once.
         colnos = bayesdb_variable_numbers(bdb, population_id, None)
         ranks = [
             self._get_loom_rank(bdb, generator_id, colno)
             for colno in colnos
         ]
         # Restrict the mapping to this generator: the table holds rows
         # for every loom generator, and rows of another generator must
         # not be paired with this generator's partition ids.
         # NOTE(review): the pairing below is positional and there is
         # no ORDER BY -- presumably the partition lists follow
         # loom_rowid order; confirm.
         rowids = bdb.sql_execute('''
             SELECT table_rowid, loom_rowid
                 FROM bayesdb_loom_rowid_mapping
                 WHERE generator_id = ?
         ''', (generator_id, )).fetchall()
         for modelno in modelnos:
             column_partition = self._retrieve_column_partition(
                 bdb, generator_id, modelno)
             # Bulk insertion of mapping from colno to kind_id.
             insertions = ','.join(
                 str((generator_id, modelno, colno, column_partition[rank]))
                 for colno, rank in zip(colnos, ranks))
             # An empty VALUES list is not valid SQL; nothing to store.
             if insertions:
                 bdb.sql_execute('''
                 INSERT OR REPLACE INTO bayesdb_loom_column_kind_partition
                 (generator_id, modelno, colno, kind_id)
                 VALUES %s
             ''' % (insertions, ))
             # Bulk insertion of mapping from (kind_id, rowid) to cluster_id.
             row_partition = self._retrieve_row_partition(
                 bdb, generator_id, modelno)
             insertions = ','.join(
                 str((generator_id, modelno, rowid[0], rowid[1], kind_id,
                      partition_id))
                 for kind_id in row_partition
                 for rowid, partition_id in zip(rowids,
                                                row_partition[kind_id]))
             if insertions:
                 bdb.sql_execute('''
                 INSERT OR REPLACE INTO
                     bayesdb_loom_row_kind_partition
                 (generator_id, modelno, table_rowid, loom_rowid,
                     kind_id, partition_id)
                 VALUES %s
             ''' % (insertions, ))
Exemplo n.º 22
0
 def _convert_to_proper_stattype(self, bdb, generator_id, colno, value):
     """Convert a value returned by the logpdf_joint method parameters into a
     form that Loom can handle. For instance, convert from an integer to
     real or, from a string to an integer.
     """
     if value is None:
         return value
     population_id = bayesdb_generator_population(bdb, generator_id)
     stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
     # If nominal, then return the integer code.
     if _is_nominal(stattype):
         return self._get_integer_form(bdb, generator_id, colno, value)
     # If countable, then return value as integer.
     elif _is_countable(stattype):
         # XXX This is going to cause a counts of 2.4 to evaluate to 2.
         # Better than having a StopIteration error coming from Loom.
         return int(value)
     # If continuous, return the value as float.
     elif _is_continuous(stattype):
         return float(value)
     else:
         assert False, 'Unknown stattype'
Exemplo n.º 23
0
    def _reorder_row(self, bdb, generator_id, row, dense=True):
        """Arrange the values of a row in loom's column order.

        `row` is a sequence of values; its i-th entry, counting from 1, is
        taken as the value of population variable number i.

        The result is a series of (column_name, value) pairs, starting with
        one pair per label from `_get_ordered_column_labels`, in that order.
        Each value is the str() of the matching row entry, or None for a
        column that `row` did not cover.

        With `dense` left at anything other than False, an iterator over all
        pairs is returned. With `dense=False`, a list is returned that omits
        the pairs whose value is None.
        """
        labels = self._get_ordered_column_labels(bdb, generator_id)
        # Every known column starts out as None, in loom's order.
        value_by_column = OrderedDict.fromkeys(labels)

        population_id = bayesdb_generator_population(bdb, generator_id)
        for colno, value in enumerate(row, 1):
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            value_by_column[name] = str(value)

        if dense is not False:
            return value_by_column.iteritems()
        return [
            (name, value)
            for (name, value) in value_by_column.iteritems()
            if value is not None
        ]
Exemplo n.º 24
0
    def _initialize_engine(self, bdb, generator_id, n, variables):
        """Create an Engine with `n` states over the given variables.

        `variables` is a sequence of 4-tuples
        (name, stattype, cctype, distargs). The data for the named variables
        is fetched with `self._data` as a list of rows, one entry per
        variable in each row.

        When `variables` is empty, a single dummy bernoulli output with one
        measurement is used instead.

        Raises BQLError naming every variable whose column is NaN in all
        rows. With no rows at all, every variable counts as all-null.
        """
        population_id = core.bayesdb_generator_population(bdb, generator_id)

        def map_var(var):
            return core.bayesdb_variable_number(bdb, population_id,
                                                generator_id, var)

        # If no variables in the population modeled by the gpmcc, then create 1
        # dummy variable with one measurement. The design space for how to
        # refactor cgpm.crosscat.State to initialize without any variables is
        # not simple, so we will live with this workaround for now.
        if not variables:
            (outputs, cctypes, distargs, gpmcc_data) = \
                [7**10], ['bernoulli'], [None], [[0]]
        else:
            gpmcc_vars = [var for var, _st, _cct, _da in variables]
            outputs = [map_var(var) for var in gpmcc_vars]
            cctypes = [cctype for _n, _st, cctype, _da in variables]
            distargs = [da for _n, _st, _cct, da in variables]
            gpmcc_data = self._data(bdb, generator_id, gpmcc_vars)
            # If gpmcc_data has any column which is all null, then crash early
            # and notify the user of all offending column names.
            # The check walks every row: the data is row-major, so the
            # length of the first row is the number of columns, not rows.
            nulls = [
                v for i, v in enumerate(gpmcc_vars)
                if all(math.isnan(row[i]) for row in gpmcc_data)
            ]
            if nulls:
                raise BQLError(
                    bdb, 'Failed to initialize, '
                    'columns have all null values: %s' % repr(nulls))

        return Engine(gpmcc_data,
                      num_states=n,
                      rng=bdb.np_prng,
                      multiprocess=self._ncpu,
                      outputs=outputs,
                      cctypes=cctypes,
                      distargs=distargs)
Exemplo n.º 25
0
 def _convert_to_proper_stattype(self, bdb, generator_id, colno, value):
     """Convert a value returned by the logpdf_joint method parameters into a
     form that Loom can handle. For instance, convert from an integer to
     real or, from a string to an integer.
     """
     if value is None:
         return value
     population_id = bayesdb_generator_population(bdb, generator_id)
     stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
     # If nominal, then return the integer code.
     if _is_nominal(stattype):
         return self._get_integer_form(bdb, generator_id, colno, value)
     # If countable, then return value as integer.
     elif _is_countable(stattype):
         # XXX This is going to cause a counts of 2.4 to evaluate to 2.
         # Better than having a StopIteration error coming from Loom.
         return int(value)
     # If continuous, return the value as float.
     elif _is_continuous(stattype):
         return float(value)
     else:
         assert False, 'Unknown stattype'
Exemplo n.º 26
0
 def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
                               colnos1, constraints, numsamples):
     """Query the cached preql server for the mutual information between
     the variables `colnos0` and the variables `colnos1`.

     The result of the server's `mutual_information` call is returned as
     is. `modelnos`, `constraints` and `numsamples` are accepted but not
     used; the sample count is always loom.preql.SAMPLE_COUNT.
     """
     # XXX Why are the constraints being ignored? If Loom does not support
     # conditioning, then implement constraints using the simple Monte Carlo
     # estimator.
     population_id = bayesdb_generator_population(bdb, generator_id)

     def names_of(colnos):
         # Column numbers -> variable names, as plain str.
         return [
             str(bayesdb_variable_name(bdb, population_id, None, number))
             for number in colnos
         ]

     target_names = names_of(colnos0)
     query_names = names_of(colnos1)
     preql = self._get_cache_entry(bdb, generator_id, 'preql_server')
     target_mask = preql._cols_to_mask(preql.encode_set(target_names))
     query_mask = preql._cols_to_mask(preql.encode_set(query_names))
     return preql._query_server.mutual_information(
         target_mask,
         query_mask,
         entropys=None,
         sample_count=loom.preql.SAMPLE_COUNT)
Exemplo n.º 27
0
    def _initialize_cgpm(self, bdb, generator_id, cgpm_ext):
        """Instantiate a registered CGPM and incorporate the generator's data.

        `cgpm_ext` is a dict with the keys 'name', 'outputs' and 'inputs'
        (variable names) and optionally 'args' and 'kwds', which are passed
        on to the CGPM constructor.

        Each data row is incorporated under its position in the data as
        rowid, with NaN cells left out. Errors raised by `incorporate` are
        ignored.

        Raises BQLError if `cgpm_ext['name']` is not in the CGPM registry.
        """
        population_id = core.bayesdb_generator_population(bdb, generator_id)

        def map_var(var):
            return core.bayesdb_variable_number(bdb, population_id,
                                                generator_id, var)

        name = cgpm_ext['name']
        outputs = [map_var(var) for var in cgpm_ext['outputs']]
        inputs = [map_var(var) for var in cgpm_ext['inputs']]
        args = cgpm_ext.get('args', ())
        kwds = cgpm_ext.get('kwds', {})
        if name not in self._cgpm_registry:
            raise BQLError(bdb, 'Unknown CGPM: %s' % (repr(name), ))
        cgpm_class = self._cgpm_registry[name]
        # Data columns are the outputs followed by the inputs.
        cgpm_data = self._data(
            bdb, generator_id, cgpm_ext['outputs'] + cgpm_ext['inputs'])
        cgpm = cgpm_class(outputs, inputs, rng=bdb.np_prng, *args, **kwds)
        n_outputs = len(outputs)
        for cgpm_rowid, row in enumerate(cgpm_data):
            # CGPMs do not uniformly handle null values or missing
            # values sensibly yet, so until we have that sorted
            # out we both (a) omit nulls and (b) ignore errors in
            # incorporate.
            query = {}
            for position, colno in enumerate(outputs):
                cell = row[position]
                if not math.isnan(cell):
                    query[colno] = cell
            evidence = {}
            for position, colno in enumerate(inputs):
                cell = row[n_outputs + position]
                if not math.isnan(cell):
                    evidence[colno] = cell
            try:
                cgpm.incorporate(cgpm_rowid, query, evidence)
            except Exception:
                pass
        return cgpm
Exemplo n.º 28
0
 def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
         colnos1, constraints, numsamples):
     """Estimate the mutual information between the variables `colnos0`
     and the variables `colnos1`, conditioned on `constraints`.

     When `_marginize_cmi(constraints)` holds, conditioning rows come from
     `_get_constraint_rows` with `numsamples` as the sample count;
     otherwise the single row from `_get_constraint_row` is used. One
     estimate is requested from the preql server per conditioning row, and
     the `.mean` of each is averaged.

     Returns a one-element list holding that average.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)

     def names_of(colnos):
         # Column numbers -> variable names, as plain str.
         return [
             str(bayesdb_variable_name(
                 bdb, population_id, generator_id, number))
             for number in colnos
         ]

     target_names = names_of(colnos0)
     query_names = names_of(colnos1)
     server = self._get_preql_server(bdb, generator_id)
     target_set = server._cols_to_mask(server.encode_set(target_names))
     query_set = server._cols_to_mask(server.encode_set(query_names))

     if self._marginize_cmi(constraints):
         conditioning_rows = self._get_constraint_rows(
             constraints, bdb, generator_id, population_id, modelnos, server,
             numsamples)
     else:
         conditioning_rows = [
             self._get_constraint_row(
                 constraints, bdb, generator_id, population_id, server)
         ]

     mi_estimates = []
     for conditioning_row in conditioning_rows:
         estimate = server._query_server.mutual_information(
             target_set,
             query_set,
             entropys=None,
             sample_count=loom.preql.SAMPLE_COUNT,
             conditioning_row=conditioning_row)
         mi_estimates.append(estimate.mean)
     # Output requires an iterable.
     return [arithmetic_mean(mi_estimates)]
Exemplo n.º 29
0
    def predict_confidence(self,
                           bdb,
                           generator_id,
                           modelno,
                           colno,
                           rowid,
                           numsamples=None):
        """Impute the cell (rowid, colno) and return (prediction, confidence).

        `numsamples` samples of the cell are drawn with `simulate_joint`
        (2 when `numsamples` is None or 0). For a categorical variable the
        prediction is a most frequent sampled value and the confidence is its
        count divided by `numsamples`. For any other variable the prediction
        is the sample mean and the confidence is always 0.

        If `rowid` is below the generator's fresh row id but greater than
        every table_rowid recorded in bayesdb_cgpm_individual for this
        generator, the row's other cells are read from the population table
        and passed to `simulate_joint` as constraints.
        """
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            # Builtin next() instead of the Python-2-only .next() method.
            pred = next(v for v in counts if counts[v] == mode_count)
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0  # XXX Punt confidence for now
            return pred, conf

        constraints = []
        # If rowid is a hypothetical cell for cgpm (did not exist at the time
        # of INITIALIZE), but exists in the base table (by INSERT INTO), then
        # retrieve all values for rowid as the constraints.
        exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
        max_cgpm_rowid = bdb.sql_execute(
            '''
            SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, )).fetchall()[0][0]
        # MAX() is NULL when no row matches. Python 2 orders any number
        # above None, so a NULL maximum means "hypothetical"; spell that out
        # rather than rely on cross-type comparison.
        hypothetical = max_cgpm_rowid is None or rowid > max_cgpm_rowid
        if exists and hypothetical:
            population_id = core.bayesdb_generator_population(
                bdb, generator_id)
            # Retrieve all other variables except colno, and ignore latents in
            # generator_id, and place them in the constraints.
            pop_names = core.bayesdb_variable_names(bdb, population_id, None)
            avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
            constraints_names = [n for n in pop_names if n != avoid_name]
            # Obtain the row.
            qt_names = ','.join(map(sqlite3_quote_name, constraints_names))
            qt_table = sqlite3_quote_name(
                core.bayesdb_population_table(bdb, population_id))
            data = bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid = ?
            ''' % (
                    qt_names,
                    qt_table,
                ), (rowid, )).fetchall()[0]
            # Build the constraints.
            pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
            constraints_nos = [n for n in pop_nos if n != colno]
            assert len(data) == len(constraints_nos)
            # Only truthy cells become constraints, so NULL is skipped.
            # NOTE(review): this also skips 0 and '' -- confirm that
            # dropping a genuine zero observation is intended.
            constraints = [(rowid, c, v)
                           for c, v in zip(constraints_nos, data)
                           if v]

        # Retrieve the samples.
        sample = self.simulate_joint(bdb, generator_id, [(rowid, colno)],
                                     constraints, modelno, numsamples)

        # Determine the imputation strategy (mode or mean).
        stattype = core.bayesdb_variable_stattype(
            bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
        if _is_categorical(stattype):
            return _impute_categorical(sample)
        else:
            return _impute_numerical(sample)
Exemplo n.º 30
0
    def analyze_models(self,
                       bdb,
                       generator_id,
                       modelnos=None,
                       iterations=1,
                       max_seconds=None,
                       ckpt_iterations=None,
                       ckpt_seconds=None,
                       program=None):
        """Run analysis transitions on the generator's engine and save it.

        `program` (default: empty) is parsed with cgpm_analyze.parse.parse
        and may contain at most one VARIABLES or SKIP clause, plus OPTIMIZED:

        - VARIABLES restricts the transitions to the listed variables;
        - SKIP transitions every variable except the listed ones;
        - OPTIMIZED switches to `engine.transition_lovecat`, and cannot be
          combined with VARIABLES.

        The engine runs for `iterations` iterations / `max_seconds` seconds
        and its metadata is then written to
        bayesdb_cgpm_generator.engine_json.

        `modelnos` must be None. Raises NotImplementedError if either
        checkpoint argument is given, BQLError for unknown variables or an
        invalid clause combination, and ValueError for an unrecognized
        clause.
        """
        assert modelnos is None

        if ckpt_iterations is not None or ckpt_seconds is not None:
            # XXX
            raise NotImplementedError(
                'CGpm analysis checkpoint not supported.')

        if program is None:
            program = []

        population_id = core.bayesdb_generator_population(bdb, generator_id)

        def known_variables(names):
            # Return `names` as a set; raise if any of them is not a
            # variable of this population/generator.
            names = list(names)
            unknown = set(
                var for var in names
                if not core.bayesdb_has_variable(
                    bdb, population_id, generator_id, var))
            if unknown:
                raise BQLError(
                    bdb, 'Unknown variables in ANALYZE: %r' %
                    (sorted(unknown), ))
            return set(names)

        def retrieve_analyze_variables(ast):
            # Transition all variables by default.
            variables = None

            # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
            seen_variables, seen_skip, seen_optimized = False, False, False
            for clause in ast:
                is_variables = isinstance(clause, cgpm_analyze.parse.Variables)
                is_skip = isinstance(clause, cgpm_analyze.parse.Skip)
                if is_variables or is_skip:
                    if seen_variables or seen_skip:
                        raise BQLError(
                            bdb,
                            'Only 1 VARIABLES or SKIP clause allowed in ANALYZE'
                        )
                    named = known_variables(clause.vars)
                    if is_variables:
                        # Transition user specified variables only.
                        seen_variables = True
                        variables = sorted(named)
                    else:
                        # Transition all variables except user specified skip.
                        seen_skip = True
                        all_vars = core.bayesdb_variable_names(
                            bdb, population_id, generator_id)
                        variables = sorted(set(all_vars) - named)
                elif isinstance(clause, cgpm_analyze.parse.Optimized):
                    seen_optimized = True
                # Unknown/impossible clause.
                else:
                    # Wrap in a 1-tuple so a tuple-valued ast is formatted
                    # as a whole instead of being spread over the format.
                    raise ValueError(
                        'Unknown clause in ANALYZE: %s.' % (ast, ))

            if variables is None:
                variables = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)

            varnos = [
                core.bayesdb_variable_number(bdb, population_id, generator_id,
                                             v) for v in variables
            ]

            # TODO Perform error checking if the OPTIMIZED clause is used.
            # In particular, the variables in OPTIMIZED must correspond
            # EXACTLY to the variables that are modeled by the CrossCat
            # baseline. Avoided this check for now since the nature of a
            # variable is not stored in the bdb. For now, just check the
            # user did not include a VARIABLES clause.
            if seen_optimized:
                if seen_variables:
                    raise BQLError(bdb,
                                   'OPTIMIZED incompatible with VARIABLES')
                # TODO Check if varnos are exactly the CrossCat variables.
                # raise BQLError(bdb,
                #     'The OPTIMIZED phrase in ANALYZE must target all the '
                #     'variables modeled by the baseline, only. '
                #     'Use SKIP to explicitly ignore analysis of overriden '
                #     'variables')

            return varnos, seen_optimized

        # Retrieve target variables and whether optimized.
        analyze_ast = cgpm_analyze.parse.parse(program)
        varnos, optimized = retrieve_analyze_variables(analyze_ast)

        engine = self._engine(bdb, generator_id)
        if optimized:
            engine.transition_lovecat(N=iterations,
                                      S=max_seconds,
                                      multiprocess=self._ncpu)
        else:
            engine.transition(N=iterations,
                              S=max_seconds,
                              cols=varnos,
                              multiprocess=self._ncpu)

        # Serialize the engine.
        engine_json = json_dumps(engine.to_metadata())

        # Update the engine.
        bdb.sql_execute(
            '''
            UPDATE bayesdb_cgpm_generator
                SET engine_json = :engine_json
                WHERE generator_id = :generator_id
        ''', {
                'generator_id': generator_id,
                'engine_json': engine_json
            })
Exemplo n.º 31
0
def _retrieve_analyze_variables(bdb, generator_id, ast):
    """Work out which variables an ANALYZE program targets.

    `ast` is an iterable of parsed ANALYZE clauses. At most one VARIABLES
    or SKIP clause is accepted, and OPTIMIZED may not be combined with
    either.

    Returns (variable_numbers, optimized). `variable_numbers` holds the
    numbers of the selected variables in sorted-name order, or None when
    the selection is empty or no VARIABLES/SKIP clause was given.
    `optimized` tells whether an OPTIMIZED clause was present.
    NOTE(review): a SKIP that excludes every variable also yields None --
    confirm callers do not read that as "all variables".

    Raises BQLError for a repeated VARIABLES/SKIP clause, unknown
    variables, an unrecognized clause, or OPTIMIZED mixed with another
    clause.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def require_known(names):
        # Collect `names` into a set, rejecting the whole clause if any
        # name is not a variable of this population/generator.
        collected = set()
        missing = set()
        for name in names:
            if not core.bayesdb_has_variable(bdb, population_id,
                                             generator_id, name):
                missing.add(name)
            collected.add(name)
        if missing:
            raise BQLError(
                bdb,
                'Unknown variables in ANALYZE: %r' % (sorted(missing), ))
        return collected

    # None means "no explicit selection": transition all variables.
    selected = None
    saw_variables = saw_skip = saw_optimized = False

    for clause in ast:
        if isinstance(clause, cgpm_analyze.parse.Variables):
            # Transition user specified variables only.
            if saw_variables or saw_skip:
                raise BQLError(
                    bdb, 'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            saw_variables = True
            selected = sorted(require_known(clause.vars))
        elif isinstance(clause, cgpm_analyze.parse.Skip):
            # Transition all variables except user specified skip.
            if saw_variables or saw_skip:
                raise BQLError(
                    bdb, 'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            saw_skip = True
            skipped = require_known(clause.vars)
            population_vars = core.bayesdb_variable_names(
                bdb, population_id, generator_id)
            selected = sorted(set(population_vars) - skipped)
        elif isinstance(clause, cgpm_analyze.parse.Optimized):
            saw_optimized = True
        else:
            # Unknown/impossible clause.
            raise BQLError(bdb, 'Unknown clause in ANALYZE: %s.' % (ast, ))

    # OPTIMIZED is incompatible with any other clause.
    if saw_optimized and (saw_variables or saw_skip):
        raise BQLError(bdb, 'OPTIMIZED incompatible with other clauses.')

    if not selected:
        return (None, saw_optimized)

    numbers = [
        core.bayesdb_variable_number(bdb, population_id, generator_id, name)
        for name in selected
    ]
    return (numbers, saw_optimized)
Exemplo n.º 32
0
    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        """Simulate `num_samples` joint samples of the target variables.

        `targets` is a list of variable numbers and `constraints` a list of
        (colno, value) pairs. If `rowid` exists in the population's base
        table, its non-null cells outside `targets` are added to the
        constraints.

        The request is sent to the preql server as a two-line CSV (header
        and values, target cells left empty) and the CSV reply is parsed
        back.

        Returns a list with one entry per simulated row; each entry lists
        the values of `targets` in order. Nominal values are returned as
        read from the CSV output, all others are converted with float().
        `modelnos` and `accuracy` are accepted but not used.

        Raises BQLError if a constraint names a variable that is already
        observed in the row.
        """
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Prepare list of full constraints, potentially adding data from table.
        constraints_full = constraints

        # If rowid exist in base table, retrieve conditioning data.
        # Conditioning values are fetched for any rowid that exists in the base
        # table irrespective of whether the rowid is incorporated in the Loom
        # model or whether it was added after creation.
        if bayesdb_table_has_rowid(bdb, table, rowid):
            # Fetch population column numbers and row values.
            colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
            rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
            observations = [(colno, rowval)
                            for colno, rowval in zip(colnos, rowvals)
                            if rowval is not None and colno not in targets]
            # Raise error if a constraint overrides an observed cell.
            colnos_constrained = [constraint[0] for constraint in constraints]
            colnos_observed = [observation[0] for observation in observations]
            if set.intersection(set(colnos_constrained), set(colnos_observed)):
                raise BQLError(
                    bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            # Update the constraints.
            constraints_full = constraints + observations

        def variable_name(colno):
            # Names are looked up by population id, like the stattype
            # lookup below; the generator id does not belong in that
            # argument position.
            return bayesdb_variable_name(bdb, population_id, None, colno)

        # Store mapping from target column name to column number and stattype.
        target_colno_to_name = {
            colno: variable_name(colno)
            for colno in targets
        }
        target_colno_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        # Construct the CSV row for targets.
        row_targets = {target_colno_to_name[colno]: '' for colno in targets}
        row_constraints = {
            variable_name(colno): value
            for colno, value in constraints_full
        }
        row = dict(
            itertools.chain(row_targets.iteritems(),
                            row_constraints.iteritems()))

        # Fetch the server.
        server = self._get_preql_server(bdb, generator_id)

        # Prepare the csv header and values.
        csv_headers = map(str, row.iterkeys())
        csv_values = map(str, row.itervalues())

        # Prepare streams for the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers, csv_values])

        # Obtain the prediction.
        server._predict(reader, num_samples, writer, False)

        # Parse the CSV output.
        output_csv = writer.result()
        output_rows = output_csv.strip().split('\r\n')

        # Extract the header of the CSV file.
        header = output_rows[0].split(CSV_DELIMITER)

        # Extract list of simulated rows. Each simulated row is represented
        # as a dictionary mapping column name to its simulated value.
        simulated_rows = [
            dict(zip(header, line.split(CSV_DELIMITER)))
            for line in output_rows[1:]
        ]

        # Prepare the return list of simulated_rows.
        def _extract_simulated_value(simulated, colno):
            colname = target_colno_to_name[colno]
            stattype = target_colno_to_stattype[colno]
            value = simulated[colname]
            return value if _is_nominal(stattype) else float(value)

        # Return the list of samples.
        return [[_extract_simulated_value(simulated, colno)
                 for colno in targets]
                for simulated in simulated_rows]
Exemplo n.º 33
0
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.
        ps, vs = zip(*baseline.params)
        vs_new = [v.value for v in vs]
        params = zip(ps, vs_new)
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(outputs, [], [], baseline.name,
                                           params)
        # And add append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                      colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb,
                       'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb, 'Latent variables already defined: %r' %
            (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb,
                       'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue

        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams

        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments. The need to be
    # in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable. Trying to using
            # default_dist might lookup the counts for unique values of the
            # categorical in the base table causing a failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams

        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(
            bdb, 'Unknown statistical types for variables: %r' %
            (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
Exemplo n.º 34
0
    def _cmd_render_crosscat(self, query, sql=None, **kwargs):
        '''Returns a rendering of the specified crosscat state

        Usage: .render_crosscat [options] <generator> <modelno>.

        Options:
            --subsample=<n>
            --width=<w>
            --height=<c>
            --rowlabels=<colname>
            --progress=[True|False]
            --yticklabeslize=<fontsize>
            --xticklabeslize=<fontsize>

        The allowable fontsize strings are:
            xx-small, x-small, # small, medium, large, x-large, xx-large
        '''
        tokens = query.split()
        if len(tokens) != 2:
            self.write_stderr('Usage: .render_crosscat <generator> <modelno>')
            return
        generator = tokens[0]
        modelno = int(tokens[1])
        if not bayesdb_has_generator(self._bdb, None, generator):
            self.write_stderr('No such generator: %s.' % (generator, ))
            return
        generator_id = bayesdb_get_generator(self._bdb, None, generator)
        population_id = bayesdb_generator_population(self._bdb, generator_id)
        backend = bayesdb_generator_backend(self._bdb, generator_id)
        if backend.name() != 'cgpm':
            self.write_stderr('.render_crosscat requires generator from the '
                              'cgpm backend')
            return
        engine = backend._engine(self._bdb, generator_id)
        cursor = self._bdb.sql_execute(
            '''
            SELECT cgpm_modelno FROM bayesdb_cgpm_modelno
            WHERE generator_id = ? AND modelno = ?
        ''', (
                generator_id,
                modelno,
            ))
        cgpm_modelno = cursor_value(cursor, nullok=True)
        if cgpm_modelno is None:
            self.write_stderr('No such model number: %d.' % (modelno, ))
            return
        state = engine.get_state(cgpm_modelno)
        row_names = None
        row_index_column = kwargs.get('rowlabels', None)
        if row_index_column is not None:
            table_name = bayesdb_generator_table(self._bdb, generator_id)
            qt = bql_quote_name(table_name)
            qc = bql_quote_name(row_index_column)
            cursor = self._bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid IN (
                    SELECT table_rowid FROM bayesdb_cgpm_individual
                    WHERE generator_id = ?
                )
            ''' % (qc, qt), (generator_id, ))
            row_names = [c[0] for c in cursor]
        if 'progress' in kwargs:
            sys.stdout.write('Creating figure...\n')
        import cgpm.utils.render
        if 'variable' not in kwargs:
            # Plot the entire state.
            col_names = [
                bayesdb_variable_name(self._bdb, population_id, None, colno)
                for colno in state.outputs
            ]
            fig, _ax = cgpm.utils.render.viz_state(state,
                                                   col_names=col_names,
                                                   row_names=row_names,
                                                   **kwargs)
        else:
            # Plot the view of the requested variable.
            varno = bayesdb_variable_number(self._bdb, population_id,
                                            generator_id, kwargs['variable'])
            view = state.view_for(varno)
            col_names = [
                bayesdb_variable_name(self._bdb, population_id, None, colno)
                for colno in view.outputs[1:]
            ]
            fig, _ax = cgpm.utils.render.viz_view(view,
                                                  col_names=col_names,
                                                  row_names=row_names,
                                                  **kwargs)
        (width, height) = fig.get_size_inches()
        if 'width' in kwargs:
            width = float(kwargs['width'])
            fig.set_size_inches(width, height)
        if 'height' in kwargs:
            height = float(kwargs['height'])
            fig.set_size_inches(width, height)
        if 'progress' in kwargs:
            sys.stdout.write('Rendering figure...\n')
Exemplo n.º 35
0
 def _get_ordered_column_labels(self, bdb, generator_id):
     """Return variable names for the generator's ordered column numbers.

     The column numbers come from ``self._get_order(bdb, generator_id)``
     and each one is resolved to a name within the generator's population;
     the resulting list follows the same order.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     labels = []
     for colno in self._get_order(bdb, generator_id):
         label = bayesdb_variable_name(bdb, population_id, None, colno)
         labels.append(label)
     return labels
Exemplo n.º 36
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table: %s' %
                                (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                            bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s' %
                                    (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table, ))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id, ) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.rename_column(bdb, generator_id,
                                                  old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, ))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder(
                '''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt, ), ())
            for cn, st, ct in zip(column_names, stattypes,
                                  distinct_value_counts):
                out.winder(
                    '''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt, ))
            out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [
                    core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids
                ]
                raise BQLError(
                    bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(
                            bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead' %
                            (population, ))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as population'
                                ': %s' % (repr(cmd.name), ))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s' % (
                            qg,
                            qp,
                        ))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'No such variable in base table: %s' % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(bdb, population_id, None,
                                                 cmd.name):
                        raise BQLError(
                            bdb,
                            'Variable already in population: %s' % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'Cannot add variable without any values: %s' %
                            (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute('SELECT %s FROM %s' %
                                                 (qc, qt))
                        rows = cursor.fetchall()
                        [stattype,
                         reason] = bayesdb_guess_stattypes([cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(
                                bdb, 'Values in column %s appear to be keys.' %
                                (cmd.name, ))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(
                                bdb, 'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name, ))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                                       'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'Numerical column contains string values: %r ' %
                            (qc, ))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(bdb, population_id, cmd.name,
                                                  stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population: %s' %
                            (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type: %r' %
                            (repr(cmd.stattype), ))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(
                                bdb, 'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars, ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' % (repr(backend_name), ))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibly implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(
                            bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(
                        bdb, 'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alterations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warnings somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb,
                               generator_id,
                               modelnos=phrase.modelnos,
                               iterations=phrase.iterations,
                               max_seconds=phrase.seconds,
                               ckpt_iterations=phrase.ckpt_iterations,
                               ckpt_seconds=phrase.ckpt_seconds,
                               program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                                              phrase.generator):
                raise BQLError(bdb,
                               'No such generator: %r' % (phrase.generator, ))
            generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                      phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(bdb, population_id, None,
                                         phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target, ))
        colno_target = core.bayesdb_variable_number(bdb, population_id, None,
                                                    phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                  generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(
                bdb,
                'Target variable is not numerical: %r' % (phrase.target, ))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(bdb, population_id, None,
                                             given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(colno for colno in colno_givens
                                  if colno != colno_target)
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(bdb,
                                      population_id,
                                      generator_id,
                                      modelnos,
                                      constraints,
                                      colnos,
                                      numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(bdb, population_id, generator_id,
                                           colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(target_values, given_values, given_names,
                                   stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder(
            '''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt, ), ())
        for variable, coef in coefficients:
            out.winder(
                '''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (
                    variable,
                    coef,
                ))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt, ))
        out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    assert False  # XXX
Exemplo n.º 37
0
    def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
            constraints, num_samples=1, accuracy=None):
        """Simulate joint values of `targets` through the Loom preql server.

        `targets` is a list of column numbers and `constraints` is a list
        of (colno, value) pairs.  If `rowid` exists in the population's
        base table, every non-null cell of that row whose column is not a
        target is added as an additional constraint.  `modelnos` and
        `accuracy` are accepted but are not used by this implementation.

        Returns a list with one entry per simulated row written by the
        server.  Each entry lists the simulated values in the order of
        `targets`: the raw string for columns whose stattype satisfies
        `_is_nominal`, and a float otherwise.

        Raises BQLError if a constraint refers to a column that is already
        observed in row `rowid`.
        """
        # Retrieve the population id and its base table.
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        def _variable_name(colno):
            # Variables are looked up by population id, exactly like the
            # bayesdb_variable_stattype lookup below.  The generator
            # argument stays None.
            return bayesdb_variable_name(bdb, population_id, None, colno)

        # Prepare list of full constraints, potentially adding data from table.
        constraints_full = constraints

        # If rowid exists in the base table, retrieve conditioning data.
        # Conditioning values are fetched for any rowid that exists in the base
        # table irrespective of whether the rowid is incorporated in the Loom
        # model or whether it was added after creation.
        if bayesdb_table_has_rowid(bdb, table, rowid):
            # Fetch population column numbers and row values.
            colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
            rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
            observations = [
                (colno, rowval)
                for colno, rowval in zip(colnos, rowvals)
                if rowval is not None and colno not in targets
            ]
            # Raise error if a constraint overrides an observed cell.
            colnos_constrained = set(pair[0] for pair in constraints)
            colnos_observed = set(pair[0] for pair in observations)
            if colnos_constrained & colnos_observed:
                raise BQLError(bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            # Update the constraints.
            constraints_full = constraints + observations

        # Store mapping from target column number to column name and stattype.
        target_colno_to_name = {
            colno: _variable_name(colno) for colno in targets
        }
        target_colno_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        # Construct the CSV row: targets are left blank, and constrained
        # columns carry their value.  Constraints are applied last so they
        # take precedence over a blank target entry of the same name.
        row = {target_colno_to_name[colno]: '' for colno in targets}
        row.update(
            (_variable_name(colno), value)
            for colno, value in constraints_full
        )

        # Fetch the server.
        server = self._get_preql_server(bdb, generator_id)

        # Prepare the csv header and values as lists, both derived from the
        # same key iteration so that the columns stay aligned.
        names = list(row)
        csv_headers = [str(name) for name in names]
        csv_values = [str(row[name]) for name in names]

        # Prepare streams for the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers, csv_values])

        # Obtain the prediction.
        server._predict(reader, num_samples, writer, False)

        # Parse the CSV output.
        output_csv = writer.result()
        output_rows = output_csv.strip().split('\r\n')

        # Extract the header of the CSV file.
        header = output_rows[0].split(CSV_DELIMITER)

        # Extract list of simulated rows. Each simulated row is represented
        # as a dictionary mapping column name to its simulated value.
        simulated_rows = [
            dict(zip(header, line.split(CSV_DELIMITER)))
            for line in output_rows[1:]
        ]

        # Convert one simulated cell to its return value.
        def _extract_simulated_value(simulated_row, colno):
            colname = target_colno_to_name[colno]
            stattype = target_colno_to_stattype[colno]
            value = simulated_row[colname]
            return value if _is_nominal(stattype) else float(value)

        # Return the list of samples.
        return [
            [
                _extract_simulated_value(simulated_row, colno)
                for colno in targets
            ]
            for simulated_row in simulated_rows
        ]
Exemplo n.º 38
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exists and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibility implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX
Exemplo n.º 39
0
    def create_generator(self, bdb, generator_id, schema_tokens, **kwargs):
        """Set up the backend's bookkeeping rows for a new generator.

        Steps, in order:

        1. Parse `schema_tokens` and build the schema via `_create_schema`
           (extra `kwargs` are forwarded to it), then store the schema as
           JSON in `bayesdb_cgpm_generator`.
        2. Register every entry of `schema['latents']` as a latent variable,
           in sorted order.
        3. For each population variable with a non-negative column number
           whose stattype satisfies `_is_categorical`, record a code for
           every distinct non-NULL value found in the base table
           (`bayesdb_cgpm_category`).
        4. Map base-table rowids to contiguous 0-indexed cgpm rowids
           (`bayesdb_cgpm_individual`).  When `schema['subsample']` is
           truthy, only a reservoir sample of that many rowids is mapped.

        :param bdb: BayesDB handle; used for SQL execution and its PRNG.
        :param generator_id: id of the generator being created.
        :param schema_tokens: tokens handed to `cgpm_schema.parse.parse`.
        :param kwargs: forwarded unchanged to `_create_schema`.
        """
        schema_ast = cgpm_schema.parse.parse(schema_tokens)
        schema = _create_schema(bdb, generator_id, schema_ast, **kwargs)

        # Store the schema.
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_cgpm_generator (generator_id, schema_json)
                VALUES (?, ?)
        ''', (generator_id, json_dumps(schema)))

        # Get the underlying population and table.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        qt = sqlite3_quote_name(table)

        # Assign latent variable numbers.  Sorting makes the registration
        # order independent of dict iteration order.
        for var, stattype in sorted(schema['latents'].iteritems()):
            core.bayesdb_add_latent(bdb, population_id, generator_id, var,
                                    stattype)

        # Assign codes to categories and consecutive column numbers to
        # the modelled variables.  Only variables with colno >= 0 are
        # visited by this query.
        vars_cursor = bdb.sql_execute(
            '''
            SELECT colno, name, stattype FROM bayesdb_variable
                WHERE population_id = ? AND 0 <= colno
        ''', (population_id, ))
        for colno, name, stattype in vars_cursor:
            if _is_categorical(stattype):
                qn = sqlite3_quote_name(name)
                cursor = bdb.sql_execute('''
                    SELECT DISTINCT %s FROM %s WHERE %s IS NOT NULL
                ''' % (qn, qt, qn))
                # Codes are the enumeration index of each distinct value in
                # whatever order the query yields them.
                for code, (value, ) in enumerate(cursor):
                    bdb.sql_execute(
                        '''
                        INSERT INTO bayesdb_cgpm_category
                            (generator_id, colno, value, code)
                            VALUES (?, ?, ?, ?)
                    ''', (generator_id, colno, value, code))

        # Assign contiguous 0-indexed ids to the individuals in the
        # table.
        if schema['subsample']:
            k = schema['subsample']
            # The total row count is not needed: reservoir sampling works
            # in a single pass without knowing the table size up front.
            cursor = bdb.sql_execute(
                'SELECT _rowid_ FROM %s ORDER BY _rowid_ ASC' % (qt, ))
            uniform = bdb._prng.weakrandom_uniform
            # https://en.wikipedia.org/wiki/Reservoir_sampling
            samples = []
            for i, row in enumerate(cursor):
                if i < k:
                    # Fill the reservoir with the first k rows.
                    samples.append(row)
                else:
                    # NOTE(review): presumably uniform(i + 1) returns an
                    # integer drawn uniformly from range(i + 1); it is used
                    # directly as a list index below -- confirm.
                    r = uniform(i + 1)
                    if r < k:
                        samples[r] = row
            cursor = samples
        else:
            cursor = bdb.sql_execute('SELECT _rowid_ FROM %s' % (qt, ))
        for cgpm_rowid, (table_rowid, ) in enumerate(cursor):
            bdb.sql_execute(
                '''
                INSERT INTO bayesdb_cgpm_individual
                    (generator_id, table_rowid, cgpm_rowid)
                    VALUES (?, ?, ?)
            ''', (generator_id, table_rowid, cgpm_rowid))
Exemplo n.º 40
0
    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        """Simulate the target variables jointly through the Loom server.

        A single query row is built in which every target column is left
        blank and every constrained column carries its value; the row is
        sent to the cached preql server, and the CSV it writes back is
        parsed into Python values.

        :param bdb: BayesDB handle passed through to the core helpers.
        :param generator_id: id of the generator whose server is queried.
        :param modelnos: accepted for interface compatibility; not used here.
        :param rowid: row to condition on.  When it differs from the fresh
            (not yet existing) row id, the non-null values stored for that
            row are added to the constraints.
        :param targets: column numbers to simulate.
        :param constraints: iterable of ``(colno, value)`` pairs.  The
            caller's object is never modified.
        :param num_samples: number of samples requested from the server.
        :param accuracy: accepted for interface compatibility; not used here.
        :returns: list of samples; each sample is a list of values ordered
            like ``targets``.  Values of non-nominal columns are converted
            to ``float``; nominal values are returned as the strings read
            from the server output.
        :raises BQLError: if a constrained column also has a stored value
            in the conditioning row.
        """
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)

        # Work on a private copy: the original extended the caller's list in
        # place, leaking the conditioning row into the caller's constraints.
        constraints = list(constraints) if constraints is not None else []

        # If rowid exists, retrieve conditioning data from the table.
        # NOTE: the helper is keyed by population, so pass population_id
        # (the original passed generator_id in that position).
        if rowid != bayesdb_population_fresh_row_id(bdb, population_id):
            row_values_raw = bayesdb_population_row_values(
                bdb, population_id, rowid)
            # Pair each non-null stored value with its positional index,
            # converting unicode text to str as the original did.
            observed = [
                (index, str(value) if isinstance(value, unicode) else value)
                for index, value in enumerate(row_values_raw)
                if value is not None
            ]
            constrained_colnos = set(c[0] for c in constraints)
            if any(colno in constrained_colnos for colno, _ in observed):
                raise BQLError(
                    bdb, 'Overlap between constraints and '
                    'target row in simulate.')
            constraints.extend(observed)

        # Prepare the query row to provide to Loom: targets are blank,
        # constraints carry their conditioning value.
        row = {}
        target_num_to_name = {}
        for colno in targets:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            target_num_to_name[colno] = name
            row[name] = ''
        for (colno, value) in constraints:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            row[name] = value

        # Fetch the server.
        server = self._get_cache_entry(bdb, generator_id, 'preql_server')

        # Prepare the csv header and values from ONE iteration order, so
        # that header i always describes value i.  (Taking the header from
        # the keys of a second dict does not guarantee the same order.)
        names = list(row)
        lower_to_upper = {str(name).lower(): str(name) for name in names}
        csv_headers = [str(name).lower() for name in names]
        csv_values = [str(row[name]) for name in names]

        # Retrieve the samples from the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers, csv_values])
        server._predict(reader, num_samples, writer, False)
        output = writer.result()

        # Parse output: first line is the header, the rest are samples.
        lines = output.strip().split('\r\n')
        returned_headers = [
            lower_to_upper[header]
            for header in lines[0].split(CSV_DELIMITER)
        ]

        # Resolve each target's name and whether it needs float conversion
        # once, instead of once per target per returned sample.
        target_info = []
        for colno in targets:
            stattype = bayesdb_variable_stattype(bdb, population_id, None,
                                                 colno)
            target_info.append(
                (target_num_to_name[colno], not _is_nominal(stattype)))

        return_list = []
        for line in lines[1:]:
            row_dict = dict(zip(returned_headers, line.split(CSV_DELIMITER)))
            return_list.append([
                float(row_dict[colname]) if numeric else row_dict[colname]
                for colname, numeric in target_info
            ])

        return return_list