Example #1
    def __init__(self, bdb, meta_logger=None, session_logger=None):
        bayesdb_schema_required(bdb, 7, 'sessions')
        self.bdb = bdb

        if meta_logger is None:
            meta_logger = BqlLogger()
        self._logger = meta_logger
        if session_logger is None:
            session_logger = CallHomeStatusLogger()
        self._session_logger = session_logger

        self._qid_to_entry_id = {}
        self._sql_tracer = _SessionTracer("sql", self)
        self._bql_tracer = _SessionTracer("bql", self)
        self.start_saving_sessions()
        self._suggested_send = False
        self._start_new_session()
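
A minimal usage sketch for the constructor above. The enclosing class is not shown in this example, so SessionOrchestrator is only an illustrative name; the sketch assumes bayeslite is installed.

# Hedged sketch: the enclosing class name is not shown above, so
# SessionOrchestrator is purely illustrative.
import bayeslite

bdb = bayeslite.bayesdb_open(None)   # volatile in-memory BayesDB
tracker = SessionOrchestrator(bdb)   # meta_logger and session_logger default to
                                     # BqlLogger() and CallHomeStatusLogger()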
Example #2
  def __init__(self, name, csv_path=None, bdb_path=None, df=None, logger=None,
               session_capture_name=None):
    """A set of shortcuts for common ways to use BayesDB.

    name : str  REQUIRED.
        The name of the dataset; it should use letters and underscores only.
        This will also be used as a table name, and %t in queries will be
        replaced by this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis at this location. If no other
        data source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn, by default a
        bayeslite.loggers.BqlLogger, but could be QuietLogger (only results),
        SilentLogger (nothing), IpyLogger, or LoggingLogger to use those
        modules, or anything else that implements the BqlLogger interface.
    session_capture_name : String
        None by default. If you want to send your session logs (perhaps
        including underlying data!) automatically to the Probabilistic
        Computing group for research purposes, possibly putting those logs and
        data into the public domain, then please set this value to a name that
        we might recognize you by. If the name is not self-evident, then please
        send that name in an email to [email protected] to help us connect your
        sessions to you.
    """
    assert re.match(r'\w+', name)
    assert df is not None or csv_path or bdb_path
    self.name = name
    self.generator_name = name + '_cc'
    self.csv_path = csv_path
    self.df = df
    self.bdb_path = bdb_path
    self.logger = BqlLogger() if logger is None else logger
    self.bdb = None
    self.status = None
    self.session_capture_name = session_capture_name
    self.initialize()
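
A hedged construction sketch for the signature documented above. The enclosing class is not named in this example (BqlRecipes in Example #4 shares this signature), and the import path, CSV filename, and capture name are assumptions.

# Hedged sketch; the import path, 'satellites.csv', and the capture name are
# placeholders, and bayeslite/bdbcontrib must be installed.
from bdbcontrib.recipes import BqlRecipes   # import path assumed

sats = BqlRecipes('satellites',
                  csv_path='satellites.csv',
                  session_capture_name='Jane Doe <jane@example.com>')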
Example #3
  def __init__(self, name, csv_path=None, bdb_path=None, df=None, logger=None,
               session_capture_name=None):
    """A set of shortcuts for common ways to use BayesDB.

    name : str  REQUIRED.
        The name of the dataset; it should use letters and underscores only.
        This will also be used as a table name, and %t in queries will be
        replaced by this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis at this location. If no other
        data source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn.
        By default a bayeslite.loggers.BqlLogger, but could be QuietLogger
        (only results), SilentLogger (nothing), IpyLogger, CaptureLogger,
        or LoggingLogger to use those modules, or anything else that
        implements the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session details
        to the MIT Probabilistic Computing Group helps build a community of
        support and helps improve your user experience. You can save your choice
        in a file called 'bayesdb-session-capture-opt.txt' in the directory
        where you run the software, or any parent directory. This option
        overrides any setting in such a file. Any string is interpreted as
        opting in to sending session details. False is interpreted as opting
        out. You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us connect
        your sessions to you.

        If you encounter a bug, or something surprising, please include your
        session capture name in your report.

        If you opt out, you still allow us to count how often users opt out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY IDENTIFIABLE,
        OR SIMILARLY SENSITIVE DATA! Opting out does not guarantee security.
    """
    assert re.match(r'\w+', name)
    assert df is not None or csv_path or bdb_path
    self.name = name
    self.generator_name = name + '_cc'
    self.csv_path = csv_path
    self.df = df
    self.bdb_path = bdb_path
    if logger is None:
      if 'IPython' in sys.modules:
        from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
        self.logger = ipy
      else:
        self.logger = BqlLogger()
    else:
      self.logger = logger
    self.bdb = None
    self.status = None
    self.session_capture_name = None
    self.generators = None
    self.initialize_session_capture(session_capture_name)
    self.initialize()
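
The opt-in/opt-out choice described above can also be saved to disk. A standalone sketch of the file that initialize_session_capture() looks for, starting in the current directory and walking up toward the filesystem root:

# Writing the literal string 'False' opts out of session capture; any other
# string (a name or email) opts in. The file may live in any parent directory.
import os

optfile = os.path.join(os.getcwd(), 'bayesdb-session-capture-opt.txt')
with open(optfile, 'w') as f:
    f.write('False')   # replace with a name/email string to opt in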
Example #4
class BqlRecipes(object):
  def __init__(self, name, csv_path=None, bdb_path=None, df=None, logger=None,
               session_capture_name=None):
    """A set of shortcuts for common ways to use BayesDB.

    name : str  REQUIRED.
        The name of the dataset; it should use letters and underscores only.
        This will also be used as a table name, and %t in queries will be
        replaced by this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis at this location. If no other
        data source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn.
        By default a bayeslite.loggers.BqlLogger, but could be QuietLogger
        (only results), SilentLogger (nothing), IpyLogger, CaptureLogger,
        or LoggingLogger to use those modules, or anything else that
        implements the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session details
        to the MIT Probabilistic Computing Group helps build a community of
        support and helps improve your user experience. You can save your choice
        in a file called 'bayesdb-session-capture-opt.txt' in the directory
        where you run the software, or any parent directory. This option
        overrides any setting in such a file. Any string is interpreted as
        opting in to sending session details. False is interpreted as opting
        out. You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us connect
        your sessions to you.

        If you encounter a bug, or something surprising, please include your
        session capture name in your report.

        If you opt out, you still allow us to count how often users opt out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY IDENTIFIABLE,
        OR SIMILARLY SENSITIVE DATA! Opting out does not guarantee security.
    """
    assert re.match(r'\w+', name)
    assert df is not None or csv_path or bdb_path
    self.name = name
    self.generator_name = name + '_cc'
    self.csv_path = csv_path
    self.df = df
    self.bdb_path = bdb_path
    if logger is None:
      if 'IPython' in sys.modules:
        from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
        self.logger = ipy
      else:
        self.logger = BqlLogger()
    else:
      self.logger = logger
    self.bdb = None
    self.status = None
    self.session_capture_name = None
    self.generators = None
    self.initialize_session_capture(session_capture_name)
    self.initialize()

  def initialize_session_capture(self, name):
    if self.session_capture_name is not None:
      return
    if name is not None:
      if name == False:
        with logged_query('count-beacon', None, name='single-opt-out'):
          pass
      self.session_capture_name = name
      return
    # Search for a session-capture name or opt-out saved as a file:
    filename = "bayesdb-session-capture-opt.txt"
    searchdir = os.getcwd()
    while searchdir != os.path.dirname(searchdir):  # While not at root.
      try:
        with open(os.path.join(searchdir, filename), 'r') as optinfile:
          self.session_capture_name = optinfile.read()
          if self.session_capture_name == 'False':
            with logged_query('count-beacon', None, name='saved-opt-out'):
              self.session_capture_name = False
          break
      except IOError:
        pass
      searchdir = os.path.dirname(searchdir)
    # No init option specified, no choice file found. Force the choice.
    if self.session_capture_name is None:
      raise BLE(ValueError(
        "Please set session_capture_name option to quickstart\n"
        "  to either opt-in or opt-out of sending details of your usage of\n"
        "  this software to the MIT Probabilistic Computing Group.\n\n"
        "If you see this in one of our example notebooks,\n"
        "  return to the starting page, the Index.ipynb, to\n"
        "  make that choice."))  # TODO: Index.ipynb is a promise.

  def initialize(self):
    if self.bdb:
      return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
      if self.df is not None:
        bayeslite.read_pandas.bayesdb_read_pandas_df(
          self.bdb, self.name, self.df, create=True, ifnotexists=True)
      elif self.csv_path:
        bayeslite.bayesdb_read_csv_file(
          self.bdb, self.name, self.csv_path,
          header=True, create=True, ifnotexists=True)
      else:
        raise BLE(ValueError("No data sources specified, and an empty bdb."))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
      size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
      assert 0 < size
      self.query('''
        CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')

  def check_representation(self):
    assert self.bdb, "Did you initialize?"
    assert self.session_capture_name is not None

  def reset(self):
    self.check_representation()
    self.query('drop generator if exists %s' % self.generator_name)
    self.query('drop table if exists %s' % self.name)
    self.bdb = None
    self.initialize()

  @helpsub('bdbcontrib_describe', bdbcontrib.describe_generator_columns.__doc__)
  def quick_describe_columns(self):
    '''Wraps bdbcontrib.describe_generator_columns with bdb and generator name.

    bdbcontrib_describe'''
    self.check_representation()
    return bdbcontrib.describe_generator_columns(self.bdb, self.generator_name)

  help_for_query = (
      """Query the database. Use %t for the data table and %g for the generator.

      %t and %g work only with word boundaries. E.g., 'LIKE "%table%"' is fine.

      Returns a pandas.DataFrame with the results, rather than the cursor that
      the underlying bdb would return, so LIMIT your queries if you need to.
      """)

  def interpret_query(self, query_string):
    '''Replace %t and %g as appropriate.'''
    return re.sub(r'(^|(?<=\s))%t\b',
                  bayeslite.bql_quote_name(self.name),
                  re.sub(r'(^|(?<=\s))%g\b',
                         bayeslite.bql_quote_name(self.generator_name),
                         query_string))

  @helpsub(r'help_for_query', help_for_query)
  def query(self, query_string, *bindings):
    '''Basic querying without session capture or reporting.

    help_for_query'''
    self.check_representation()
    query_string = self.interpret_query(query_string)
    self.logger.info("BQL [%s] [%r]", query_string, bindings)
    with self.bdb.savepoint():
      try:
        res = self.bdb.execute(query_string, bindings)
        assert res is not None and res.description is not None
        self.logger.debug("BQL [%s] [%r] has returned a cursor." %
                          (query_string, bindings))
        df = bdbcontrib.cursor_to_df(res)
        self.logger.debug("BQL [%s] [%r] has created a dataframe." %
                            (query_string, bindings))
        return df
      except:
        self.logger.exception("")

  @helpsub(r'help_for_query', help_for_query)
  def q(self, query_string, *bindings):
    '''help_for_query'''
    with logged_query(query_string, bindings, name=self.session_capture_name):
      return self.query(query_string, *bindings)

  @helpsub('bdbcontrib_nullify_doc', bdbcontrib.nullify.__doc__)
  def nullify(self, value):
    """Wraps bdbcontrib.nullify by passing bdb and name.

    bdbcontrib_nullify_doc"""
    bdbcontrib.nullify(self.bdb, self.name, value)

  def analyze(self, models=100, minutes=0, iterations=0, checkpoint=0):
    '''Run analysis.

    models : integer
        The number of models bounds the accuracy of predictive probabilities.
        With ten models you get one decimal digit of interpretability,
        with a hundred models you get two, and so on.
    minutes : integer
        How long you want to let it run.
    iterations : integer
        How many iterations to let it run.

    Returns:
        A report indicating how many models have seen how many iterations,
        and other info about model stability.
    '''
    self.check_representation()
    with logged_query(query_string='recipes.analyze',
                      name=self.session_capture_name):
      if models > 0:
        self.query('INITIALIZE %d MODELS IF NOT EXISTS FOR %s' %
              (models, self.generator_name))
        assert minutes == 0 or iterations == 0
      else:
        models = self.analysis_status().sum()
      if minutes > 0:
        if checkpoint == 0:
          checkpoint = max(1, int(minutes * models / 200))
        analyzer = ('ANALYZE %s FOR %d MINUTES CHECKPOINT %d ITERATION WAIT' %
                    (self.generator_name, minutes, checkpoint))
        with logged_query(query_string=analyzer,
                          name=self.session_capture_name,
                          bindings=self.query('SELECT * FROM %t')):
          self.query(analyzer)
      elif iterations > 0:
        if checkpoint == 0:
          checkpoint = max(1, int(iterations / 20))
        self.query(
            '''ANALYZE %s FOR %d ITERATIONS CHECKPOINT %d ITERATION WAIT''' % (
              self.generator_name, iterations, checkpoint))
      else:
        raise BLE(NotImplementedError('No default analysis strategy yet.'
                                      ' Please specify minutes or iterations.'))
    # itrs = self.per_model_analysis_status()
    # models_with_fewest_iterations =
    #    itrs[itrs['iterations'] == itrs.min('index').head(0)[0]].index.tolist()
    # TODO(gremio): run each model with as many iterations as it needs to get
    # up to where it needs to get to, if that's larger?
    # Nope. Vikash said there's no reason to think that's a good idea. Perhaps
    # even better to have some young models mixed in with the old ones.
    # I still think we should make some recommendation that scales for what
    # "the right thing" is, where that's something that at least isn't known to
    # suck.

    return self.analysis_status()

  # TODO: remove import plot_utils from the __init__.py file -- make it empty.
  # If you want to use it, you should import bdbcontrib.foo.

  def per_model_analysis_status(self):
    """Return the number of iterations for each model."""
    # XXX Move this to bdbcontrib/src/bql_utils.py ?
    self.check_representation()
    try:
      return self.query('''SELECT iterations FROM bayesdb_generator_model
                           WHERE generator_id = (
                            SELECT id FROM bayesdb_generator WHERE name = ?)''',
                        self.generator_name)
    except ValueError:
      # Because, e.g. there is no generator yet, for an empty db.
      return None

  def analysis_status(self):
    """Return the count of models for each number of iterations run."""
    self.check_representation()
    itrs = self.per_model_analysis_status()
    if itrs is None or len(itrs) == 0:
      emt = pd.DataFrame(columns=['count of model instances'])
      emt.index.name = 'iterations'
      return emt
    vcs = pd.DataFrame(itrs['iterations'].value_counts())
    vcs.index.name = 'iterations'
    vcs.columns = ['count of model instances']
    self.status = vcs
    return vcs

  @helpsub('bdbcontrib_pairplot', bdbcontrib.plot_utils.pairplot.__doc__)
  def pairplot_vars(self, vars, plotfile=None, colorby=None, **kwargs):
    """Wrap bdbcontrib.plot_utils.pairplot to show the given columns.

    Specifies bdb, query with the given columns, and generator_name:
    bdbcontrib_pairplot
    """
    self.check_representation()
    with logged_query(query_string='pairplot_vars=?', bindings=(vars,),
                      name=self.session_capture_name):
      if len(vars) < 1:
        raise BLE(ValueError('Pairplot requires at least one variable.'))
      qvars = vars if colorby is None else set(vars + [colorby])
      query_columns = '''"%s"''' % '''", "'''.join(qvars)
      self.logger.plot(plotfile,
                       bdbcontrib.pairplot(self.bdb, '''SELECT %s FROM %s''' %
                                             (query_columns, self.name),
                                           generator_name=self.generator_name,
                                           colorby=colorby,
                                           **kwargs))

  def heatmap(self, deps, selectors=None, plotfile=None, **kwargs):
    '''Show heatmaps for the given dependencies

    Parameters
    ----------
    deps : pandas.DataFrame(columns=['generator_id', 'name0', 'name1', 'value'])
        The result of a .q('ESTIMATE ... PAIRWISE ...')
        E.g., DEPENDENCE PROBABILITY, MUTUAL INFORMATION, COVARIANCE, etc.

    selectors : {str: lambda name --> bool}
        Rather than plot the full NxN matrix all together, make separate plots
        for each combination of these selectors, plotting them in sequence.
        If selectors are specified, the actual selector functions are values of
        a dict, and the keys are their names, for purposes of plot legends and
        filenames.
        E.g.,
          {'A-E': lambda x: bool(re.search(r'^[a-eA-E]', x[0])),
           'F-O': lambda x: bool(re.search(r'^[f-oF-O]', x[0])),
           'P-Z': lambda x: bool(re.search(r'^[p-zP-Z]', x[0]))}

    plotfile : str
        If a plotfile is specified, savefig to that file. If selectors are also
        specified, savefig to name1.name2.plotfile.

    **kwargs : dict
        Passed to zmatrix: vmin, vmax, row_ordering, col_ordering
    '''
    self.check_representation()
    with logged_query(query_string='heatmap(deps, selectors)',
                      bindings=(str(deps), repr(selectors)),
                      name=self.session_capture_name):
      hmap = plt.figure()
      if selectors is None:
        cmap = bdbcontrib.plot_utils.heatmap(self.bdb, df=deps, **kwargs)
        self.logger.plot(plotfile, cmap)
      else:
        selfns = [selectors[k] for k in sorted(selectors.keys())]
        reverse = dict([(v, k) for (k, v) in selectors.items()])
        for (cmap, sel1, sel2) in bdbcontrib.plot_utils.selected_heatmaps(
             self.bdb, df=deps, selectors=selfns, **kwargs):
          self.logger.plot("%s.%s.%s" % (
              reverse[sel1], reverse[sel2], plotfile),
                           cmap)
      return hmap

  def quick_explore_vars(self, vars, nsimilar=20, plotfile='explore_vars'):
    """Show dependence probabilities and neighborhoods based on those.

    vars: list of strings
        At least two column names to look at dependence probabilities of,
        and to explore neighborhoods of.
    nsimilar: positive integer
        The size of the neighborhood to explore.
    plotfile: string pathname
        Where to save plots, if not displaying them on console.
    """
    self.check_representation()
    with logged_query(query_string='quick_explore_vars', bindings=(vars,),
                    name=self.session_capture_name):
      if len(vars) < 2:
        raise BLE(ValueError('Need to explore at least two variables.'))
      self.pairplot_vars(vars)
      query_columns = '''"%s"''' % '''", "'''.join(vars)
      deps = self.query('''ESTIMATE DEPENDENCE PROBABILITY
                           FROM PAIRWISE COLUMNS OF %s
                           FOR %s;''' % (self.generator_name, query_columns))
      deps.columns = ['genid', 'name0', 'name1', 'value']
      self.heatmap(deps, plotfile=plotfile)
      deps.columns = ['genid', 'name0', 'name1', 'value']
      triangle = deps[deps['name0'] < deps['name1']]
      triangle = triangle.sort_values(ascending=False, by=['value'])
      self.logger.result("Pairwise dependence probability for: %s\n%s\n\n",
                         query_columns, triangle)

      for col in vars:
        neighborhood = self.query(
        '''ESTIMATE *, DEPENDENCE PROBABILITY WITH "%s"
           AS "Probability of Dependence with %s"
           FROM COLUMNS OF %s
           ORDER BY "Probability of Dependence with %s"
           DESC LIMIT %d;'''
           % (col, col, self.generator_name, col, nsimilar))
        neighbor_columns = ('''"%s"''' %
                            '''", "'''.join(neighborhood["name"].tolist()))
        deps = self.query('''ESTIMATE DEPENDENCE PROBABILITY
            FROM PAIRWISE COLUMNS OF %s
            FOR %s;''' % (self.generator_name, neighbor_columns))
        deps.columns = ['genid', 'name0', 'name1', 'value']
        self.heatmap(deps, plotfile=(plotfile + "-" + col))
        self.logger.result("Pairwise dependence probability of %s with its " +
                           "strongest dependents:\n%s\n\n", col, neighborhood)

  def column_type(self, col):
    """The statistical type of the given column in the current model."""
    self.check_representation()
    descriptions = self.quick_describe_columns()
    return descriptions[descriptions['name'] == col]['stattype'].iloc[0]

  def sql_tracing(self, turn_on=True):
    """Trace underlying SQL, for debugging."""
    self.check_representation()
    # Always turn off:
    self.bdb.sql_untrace(self.bdb.sql_tracer)
    if turn_on:
      printer = lambda query, bindings: self.logger.info(
          "query: [%s] bindings: [%s]\n\n", query, bindings)
      self.bdb.sql_trace(printer)

  def quick_similar_rows(self, identify_row_by, nsimilar=10):
    """Explore rows similar to the identified one.

    identify_row_by : dict
        Dictionary of column names to their values. These will be turned into
        a WHERE clause in BQL, and must identify one unique row.
    nsimilar : positive integer
        The number of similar rows to retrieve.
    """
    self.check_representation()
    with logged_query(query_string='quick_similar_rows(id_by, n)',
                      bindings=(identify_row_by, nsimilar),
                      name=self.session_capture_name):
      import hashlib
      table_name = 'tmptbl_' + hashlib.md5('\x00'.join(
          [repr(identify_row_by), str(self.status)])).hexdigest()
      column_name = 'similarity_to_' + "__".join(
          re.sub(r'\W', '_', str(val)) for val in identify_row_by.values())
      query_params = []
      query_columns = []
      for k, v in identify_row_by.iteritems():
        query_columns.append('''%s = ? ''' % bayeslite.bql_quote_name(k))
        query_params.append(v)
      query_attrs = ' and '.join(query_columns)

      with self.bdb.savepoint():
        row_exists = self.query('SELECT COUNT(*) FROM %s WHERE %s;' %
                                (self.name, query_attrs))
        if row_exists.ix[0][0] != 1:
          raise BLE(NotImplementedError(
              'identify_row_by found %d rows instead of exactly 1 in %s.' %
              (row_exists.ix[0][0], self.csv_path)))
        creation_query = ('''CREATE TEMP TABLE IF NOT EXISTS %s AS ESTIMATE *,
                             SIMILARITY TO (%s) AS %s FROM %%g LIMIT %d;''' %
                          (table_name, query_attrs, column_name, nsimilar))
        self.query(creation_query, *query_params)
        result = self.query('''SELECT * FROM %s ORDER BY %s DESC;''' %
                            (table_name, column_name))
      return result
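
A hedged end-to-end sketch of the BqlRecipes workflow defined above. The CSV path, column names, and model/iteration counts are placeholders, and bayeslite/bdbcontrib must be installed.

# Hedged sketch; 'data.csv', 'col_a', and 'col_b' are placeholders.
recipe = BqlRecipes('mydata', csv_path='data.csv',
                    session_capture_name=False)   # opt out of session capture
recipe.nullify('NaN')                             # treat the string 'NaN' as missing
recipe.analyze(models=16, iterations=100)         # INITIALIZE, then ANALYZE, the generator
print recipe.analysis_status()                    # count of models per iteration total
recipe.quick_explore_vars(['col_a', 'col_b'], plotfile='explore_vars')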
Example #5
    def __init__(self,
                 name,
                 csv_path=None,
                 bdb_path=None,
                 df=None,
                 logger=None,
                 session_capture_name=None):
        """Create a Population object, wrapping a bayeslite.BayesDB.

    name : str  REQUIRED.
        A name for the population; it should use letters and underscores only.
        This will also be used as a table name in the bdb, and %t in queries
        will expand to this name. %g in queries will expand to the current
        population metamodel, also based on this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis results here. If no other data
        source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn to send messages to the
        user. By default a bayeslite.loggers.BqlLogger, but could be QuietLogger
        (only results), SilentLogger (nothing), IpyLogger, CaptureLogger,
        LoggingLogger, or anything else that implements the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session details
        to the MIT Probabilistic Computing Group helps build a community of
        support and helps improve your user experience. You can save your choice
        in a file called 'bayesdb-session-capture-opt.txt' in the directory
        where you run the software, or any parent directory. This option
        overrides any setting in such a file. Any string is interpreted as
        opting in to sending session details. False is interpreted as opting
        out. You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us connect
        your sessions to you.

        If you encounter a bug, or something surprising, please include your
        session capture name in your report.

        If you opt out, you still allow us to count how often users opt out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY IDENTIFIABLE,
        OR SIMILARLY SENSITIVE DATA! Opting out does not guarantee security.
    """
        Population.method_imports()
        assert re.match(r'\w+', name)
        assert df is not None or csv_path or bdb_path
        self.name = name
        self.generator_name = name + '_cc'  # Because we use the default metamodel.
        self.csv_path = csv_path
        self.df = df
        self.bdb_path = bdb_path
        if logger is None:
            if 'IPython' in sys.modules:
                from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
                self.logger = ipy
            else:
                self.logger = BqlLogger()
        else:
            self.logger = logger
        self.bdb = None
        self.status = None
        self.session_capture_name = None
        self.generators = []
        with logged_query('count-beacon', None, name='count-beacon'):
            self.initialize_session_capture(session_capture_name)
        self.initialize()
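
A hedged sketch of building a Population directly from a pandas DataFrame; per the docstring, df supersedes any csv_path, and the data below are illustrative.

# Hedged sketch; the frame contents are placeholders and bayeslite/bdbcontrib
# must be installed. session_capture_name=False opts out of capture.
import pandas as pd

df = pd.DataFrame({'age': [23, 54, 31, 47],
                   'height': [1.70, 1.60, 1.82, 1.75]})
pop = Population('people', df=df, session_capture_name=False)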
Example #6
class Population(object):
    """Generative Population Model, wraps a BayesDB, and tracks one population.

  See Population.help() for a short menu of available methods.
  See help(Population) as usual for more complete information.
  """

    shortdocs = []

    @classmethod
    def method_imports(cls):
        """Runs decorators that add methods to Population."""
        # Set up a place for methods to deposit their short documentations for help:
        cls.shortdocs = []

        # These are here rather than in, say __init__.py so doing import bdbcontrib
        # just to get its __version__ for example doesn't need to run all that code.
        # __init__.py does import population (this file) for Population's __doc__
        # (same as this class's __init__.__doc__) so these can't just be at top.
        # But once you're using Populations, you have to pay this price once.
        import bql_utils
        import plot_utils
        import recipes
        import diagnostic_utils
        import crosscat_utils
        # Convenience alias:
        cls.q = cls.query
        cls.vartype = cls.get_column_stattype
        cls.quick_describe_columns = cls.variable_stattypes

    def __init__(self,
                 name,
                 csv_path=None,
                 bdb_path=None,
                 df=None,
                 logger=None,
                 session_capture_name=None):
        """Create a Population object, wrapping a bayeslite.BayesDB.

    name : str  REQUIRED.
        A name for the population; it should use letters and underscores only.
        This will also be used as a table name in the bdb, and %t in queries
        will expand to this name. %g in queries will expand to the current
        population metamodel, also based on this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis results here. If no other data
        source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn to send messages to the
        user. By default a bayeslite.loggers.BqlLogger, but could be QuietLogger
        (only results), SilentLogger (nothing), IpyLogger, CaptureLogger,
        LoggingLogger, or anything else that implements the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session details
        to the MIT Probabilistic Computing Group helps build a community of
        support and helps improve your user experience. You can save your choice
        in a file called 'bayesdb-session-capture-opt.txt' in the directory
        where you run the software, or any parent directory. This option
        overrides any setting in such a file. Any string is interpreted as
        opting in to sending session details. False is interpreted as opting
        out. You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us connect
        your sessions to you.

        If you encounter a bug, or something surprising, please include your
        session capture name in your report.

        If you opt out, you still allow us to count how often users opt out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY IDENTIFIABLE,
        OR SIMILARLY SENSITIVE DATA! Opting out does not guarantee security.
    """
        Population.method_imports()
        assert re.match(r'\w+', name)
        assert df is not None or csv_path or bdb_path
        self.name = name
        self.generator_name = name + '_cc'  # Because we use the default metamodel.
        self.csv_path = csv_path
        self.df = df
        self.bdb_path = bdb_path
        if logger is None:
            if 'IPython' in sys.modules:
                from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
                self.logger = ipy
            else:
                self.logger = BqlLogger()
        else:
            self.logger = logger
        self.bdb = None
        self.status = None
        self.session_capture_name = None
        self.generators = []
        with logged_query('count-beacon', None, name='count-beacon'):
            self.initialize_session_capture(session_capture_name)
        self.initialize()

    def initialize_session_capture(self, name):
        if self.session_capture_name is not None:
            return
        if name is not None:
            self.session_capture_name = name
            return
        # Search for a session-capture name or opt-out saved as a file:
        searchdir = os.getcwd()
        while searchdir != os.path.dirname(searchdir):  # While not at root.
            try:
                with open(os.path.join(searchdir, OPTFILE), 'r') as optinfile:
                    self.session_capture_name = optinfile.read()
                    if self.session_capture_name == 'False':
                        self.session_capture_name = False
                    break
            except IOError:
                pass
            searchdir = os.path.dirname(searchdir)
        # No init option specified, no choice file found. Force the choice.
        if self.session_capture_name is None:
            raise BLE(
                "Please set session_capture_name option to Population.__init__\n"
                "  to either opt-in or opt-out of sending details of your usage of\n"
                "  this software to the MIT Probabilistic Computing Group.\n\n"
                "If you see this in one of our example notebooks,\n"
                "  return to the starting page, the Index.ipynb, to\n"
                "  make that choice.")

    def initialize(self):
        if self.bdb:
            self.check_representation()
            return
        self.bdb = bayeslite.bayesdb_open(self.bdb_path)
        if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
            if self.df is not None:
                bayeslite.read_pandas.bayesdb_read_pandas_df(self.bdb,
                                                             self.name,
                                                             self.df,
                                                             create=True,
                                                             ifnotexists=True)
            elif self.csv_path:
                bayeslite.bayesdb_read_csv_file(self.bdb,
                                                self.name,
                                                self.csv_path,
                                                header=True,
                                                create=True,
                                                ifnotexists=True)
            else:
                tables = self.list_tables()
                metamodels = self.list_metamodels()
                if len(tables) + len(metamodels) == 0:
                    raise BLE(
                        ValueError(
                            "No data sources specified, and an empty bdb."))
                else:
                    raise BLE(
                        ValueError(
                            "The name of the population must be the same"
                            " as a table in the bdb, one of: " +
                            ", ".join(tables) +
                            "\nNote also that the bdb has the following"
                            " metamodels defined: " + ", ".join(metamodels)))
        self.generators = self.query('''SELECT * FROM bayesdb_generator''')
        if len(self.generators) == 0:
            size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
            assert 0 < size
            self.query('''
        CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
        self.check_representation()

    def check_representation(self):
        assert self.bdb, "Did you initialize?"
        assert self.session_capture_name is not None

    def interpret_bql(self, query_string):
        '''Replace %t and %g as appropriate.'''
        return re.sub(
            r'(^|(?<=\s))%t\b', bayeslite.bql_quote_name(self.name),
            re.sub(r'(^|(?<=\s))%g\b',
                   bayeslite.bql_quote_name(self.generator_name),
                   query_string))

    def sql_tracing(self, turn_on=True):
        """Trace underlying SQL, for debugging."""
        self.check_representation()
        # Always turn off:
        self.bdb.sql_untrace(self.bdb.sql_tracer)
        if turn_on:
            printer = lambda query, bindings: self.logger.info(
                "query: [%s] bindings: [%s]\n\n", query, bindings)
            self.bdb.sql_trace(printer)

    def reset(self):
        self.check_representation()
        self.query('drop generator if exists %s' % self.generator_name)
        self.query('drop table if exists %s' % self.name)
        self.bdb = None
        self.initialize()

    def specifier_to_df(self, spec):
        if isinstance(spec, pd.DataFrame):
            return spec
        else:
            return self.query(spec)

    def help(self, filter=None):
        """Show a short menu of available methods.

    filter : string or re
        Show only methods whose descriptions match the given pattern.
    """
        response = self.shortdocs
        if filter is not None:
            if hasattr(filter, '__call__'):
                response = [r for r in response if filter(r)]
            else:
                response = [r for r in response if re.search(filter, r)]
        print '\n'.join(response)
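
A hedged sketch of the help() filter above, assuming a Population instance pop constructed as in the sketch after Example #5; the shortdocs entries are contributed by the utility modules that method_imports() pulls in.

# Hedged sketch; assumes `pop` from the Example #5 sketch.
pop.help()                                # full menu of one-line descriptions
pop.help('plot')                          # only entries matching the regex 'plot'
pop.help(lambda line: 'analyze' in line)  # a callable filters entries directly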
Example #7
class Population(object):
  """Generative Population Model, wraps a BayesDB, and tracks one population."""

  @classmethod
  def method_imports(cls):
    """Runs decorators that add methods to Population."""
    # These are here rather than in, say __init__.py so doing import bdbcontrib
    # just to get its __version__ for example doesn't need to run all that code.
    # __init__.py does import population (this file) for quickstart's __doc__
    # (same as this class's __init__.__doc__) so these can't just be at top.
    # But once you're using Populations, you have to pay this price once.
    import bql_utils
    import plot_utils
    import recipes
    import diagnostic_utils
    import crosscat_utils
    # Convenience alias:
    cls.q = cls.query

  def __init__(self, name, csv_path=None, bdb_path=None, df=None, logger=None,
               session_capture_name=None):
    """Create a Population object, wrapping a bayeslite.BayesDB.

    name : str  REQUIRED.
        A name for the population; it should use letters and underscores only.
        This will also be used as a table name in the bdb, and %t in queries
        will expand to this name. %g in queries will expand to the current
        population metamodel, also based on this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis results here. If no other data
        source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn to send messages to the
        user. By default a bayeslite.loggers.BqlLogger, but could be QuietLogger
        (only results), SilentLogger (nothing), IpyLogger, CaptureLogger,
        LoggingLogger, or anything else that implements the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session details
        to the MIT Probabilistic Computing Group helps build a community of
        support and helps improve your user experience. You can save your choice
        in a file called 'bayesdb-session-capture-opt.txt' in the directory
        where you run the software, or any parent directory. This option
        overrides any setting in such a file. Any string is interpreted as
        opting in to sending session details. False is interpreted as opting
        out. You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us connect
        your sessions to you.

        If you encounter a bug, or something surprising, please include your
        session capture name in your report.

        If you opt out, you still allow us to count how often users opt out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY IDENTIFIABLE,
        OR SIMILARLY SENSITIVE DATA! Opting out does not guarantee security.
    """
    Population.method_imports()
    assert re.match(r'\w+', name)
    assert df is not None or csv_path or bdb_path
    self.name = name
    self.generator_name = name + '_cc' # Because we use the default metamodel.
    self.csv_path = csv_path
    self.df = df
    self.bdb_path = bdb_path
    if logger is None:
      if 'IPython' in sys.modules:
        from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
        self.logger = ipy
      else:
        self.logger = BqlLogger()
    else:
      self.logger = logger
    self.bdb = None
    self.status = None
    self.session_capture_name = None
    self.generators = []
    with logged_query('count-beacon', None, name='count-beacon'):
      self.initialize_session_capture(session_capture_name)
    self.initialize()

  def initialize_session_capture(self, name):
    if self.session_capture_name is not None:
      return
    if name is not None:
      self.session_capture_name = name
      return
    # Search for a session-capture name or opt-out saved as a file:
    filename = "bayesdb-session-capture-opt.txt"
    searchdir = os.getcwd()
    while searchdir != os.path.dirname(searchdir):  # While not at root.
      try:
        with open(os.path.join(searchdir, filename), 'r') as optinfile:
          self.session_capture_name = optinfile.read()
          if self.session_capture_name == 'False':
            self.session_capture_name = False
          break
      except IOError:
        pass
      searchdir = os.path.dirname(searchdir)
    # No init option specified, no choice file found. Force the choice.
    if self.session_capture_name is None:
      raise BLE(
        "Please set session_capture_name option to quickstart\n"
        "  to either opt-in or opt-out of sending details of your usage of\n"
        "  this software to the MIT Probabilistic Computing Group.\n\n"
        "If you see this in one of our example notebooks,\n"
        "  return to the starting page, the Index.ipynb, to\n"
        "  make that choice.")

  def initialize(self):
    if self.bdb:
      self.check_representation()
      return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
      if self.df is not None:
        bayeslite.read_pandas.bayesdb_read_pandas_df(
          self.bdb, self.name, self.df, create=True, ifnotexists=True)
      elif self.csv_path:
        bayeslite.bayesdb_read_csv_file(
          self.bdb, self.name, self.csv_path,
          header=True, create=True, ifnotexists=True)
      else:
        raise BLE(ValueError("No data sources specified, and an empty bdb."))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
      size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
      assert 0 < size
      self.query('''
        CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
    self.check_representation()

  def check_representation(self):
    assert self.bdb, "Did you initialize?"
    assert self.session_capture_name is not None

  def interpret_bql(self, query_string):
    '''Replace %t and %g as appropriate.'''
    return re.sub(r'(^|(?<=\s))%t\b',
                  bayeslite.bql_quote_name(self.name),
                  re.sub(r'(^|(?<=\s))%g\b',
                         bayeslite.bql_quote_name(self.generator_name),
                         query_string))

  def analyze(self, models=100, minutes=0, iterations=0, checkpoint=0):
    '''Run analysis.

    models : integer
        The number of models bounds the accuracy of predictive probabilities.
        With ten models you get one decimal digit of interpretability,
        with a hundred models you get two, and so on.
    minutes : integer
        How long you want to let it run.
    iterations : integer
        How many iterations to let it run.

    Returns:
        A report indicating how many models have seen how many iterations,
        and other info about model stability.
    '''
    self.check_representation()
    with logged_query(query_string='recipes.analyze',
                      name=self.session_capture_name):
      if models > 0:
        self.query('INITIALIZE %d MODELS IF NOT EXISTS FOR %s' %
              (models, self.generator_name))
        assert minutes == 0 or iterations == 0
      else:
        models = self.analysis_status().sum()
      if minutes > 0:
        if checkpoint == 0:
          checkpoint = max(1, int(minutes * models / 200))
        analyzer = ('ANALYZE %s FOR %d MINUTES CHECKPOINT %d ITERATION WAIT' %
                    (self.generator_name, minutes, checkpoint))
        with logged_query(query_string=analyzer,
                          name=self.session_capture_name,
                          bindings=self.query('SELECT * FROM %t')):
          self.query(analyzer)
      elif iterations > 0:
        if checkpoint == 0:
          checkpoint = max(1, int(iterations / 20))
        self.query(
            '''ANALYZE %s FOR %d ITERATIONS CHECKPOINT %d ITERATION WAIT''' % (
                self.generator_name, iterations, checkpoint))
      else:
        raise NotImplementedError('No default analysis strategy yet. '
                                  'Please specify minutes or iterations.')
    # itrs = self.per_model_analysis_status()
    # models_with_fewest_iterations =
    #    itrs[itrs['iterations'] == itrs.min('index').head(0)[0]].index.tolist()
    # TODO(gremio): run each model with as many iterations as it needs to get
    # up to where it needs to get to, if that's larger?
    # Nope. Vikash said there's no reason to think that's a good idea. Perhaps
    # even better to have some young models mixed in with the old ones.
    # I still think we should make some recommendation that scales for what
    # "the right thing" is, where that's something that at least isn't known to
    # suck.

    return self.analysis_status()

  def per_model_analysis_status(self):
    """Return the number of iterations for each model."""
    # XXX Move this to bdbcontrib/src/bql_utils.py ?
    self.check_representation()
    try:
      return self.query('''SELECT iterations FROM bayesdb_generator_model
                           WHERE generator_id = (
                            SELECT id FROM bayesdb_generator WHERE name = ?)''',
                        (self.generator_name,))
    except ValueError:
      # Because, e.g. there is no generator yet, for an empty db.
      return None

  def analysis_status(self):
    """Return the count of models for each number of iterations run."""
    self.check_representation()
    itrs = self.per_model_analysis_status()
    if itrs is None or len(itrs) == 0:
      emt = pd.DataFrame(columns=['count of model instances'])
      emt.index.name = 'iterations'
      return emt
    vcs = pd.DataFrame(itrs['iterations'].value_counts())
    vcs.index.name = 'iterations'
    vcs.columns = ['count of model instances']
    self.status = vcs
    return vcs

  def sql_tracing(self, turn_on=True):
    """Trace underlying SQL, for debugging."""
    self.check_representation()
    # Always turn off:
    self.bdb.sql_untrace(self.bdb.sql_tracer)
    if turn_on:
      printer = lambda query, bindings: self.logger.info(
          "query: [%s] bindings: [%s]\n\n", query, bindings)
      self.bdb.sql_trace(printer)

  def reset(self):
    self.check_representation()
    self.query('drop generator if exists %s' % self.generator_name)
    self.query('drop table if exists %s' % self.name)
    self.bdb = None
    self.initialize()

  def specifier_to_df(self, spec):
    if isinstance(spec, pd.DataFrame):
      return spec
    else:
      return self.query(spec)
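
A hedged sketch of the analysis loop above: initialize models, run a fixed number of iterations, then inspect progress. It assumes a Population pop as in the Example #5 sketch; the counts are illustrative.

# Hedged sketch; model/iteration counts are placeholders and runtime depends
# on the data. Assumes `pop` from the Example #5 sketch.
pop.analyze(models=32, iterations=200, checkpoint=10)
print pop.analysis_status()             # count of model instances per iteration total
print pop.per_model_analysis_status()   # one 'iterations' row per model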
Example #8
class Population(object):
  """Generative Population Model, wraps a BayesDB, and tracks one population.

  See Population.help() for a short menu of available methods.
  See help(Population) as usual for more complete information.
  """

  shortdocs = []

  @classmethod
  def method_imports(cls):
    """Runs decorators that add methods to Population."""
    # Set up a place for methods to deposit their short documentations for help:
    cls.shortdocs = []

    # These are here rather than in, say __init__.py so doing import bdbcontrib
    # just to get its __version__ for example doesn't need to run all that code.
    # __init__.py does import population (this file) for Population's __doc__
    # (same as this class's __init__.__doc__) so these can't just be at top.
    # But once you're using Populations, you have to pay this price once.
    import bql_utils
    import plot_utils
    import recipes
    import diagnostic_utils
    import crosscat_utils
    # Convenience alias:
    cls.q = cls.query
    cls.vartype = cls.get_column_stattype
    cls.quick_describe_columns = cls.variable_stattypes

  def __init__(self, name, csv_path=None, bdb_path=None, df=None, logger=None,
               session_capture_name=None):
    """Create a Population object, wrapping a bayeslite.BayesDB.

    name : str  REQUIRED.
        A name for the population; it should use letters and underscores only.
        This will also be used as a table name in the bdb, and %t in queries
        will expand to this name. %g in queries will expand to the current
        population metamodel, also based on this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be used
        to populate the bdb. It must exist and be both readable and non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb, superseding
        any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis results here. If no other data
        source (csv or df) is specified, then it must already have been
        populated. If not specified, we will use a volatile in-memory bdb.
    logger : object
        Something on which we can call .info or .warn to send messages to the
        user. By default a bayeslite.loggers.BqlLogger, but could be QuietLogger
        (only results), SilentLogger (nothing), IpyLogger, CaptureLogger,
        LoggingLogger, or anything else that implements the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session details
        to the MIT Probabilistic Computing Group helps build a community of
        support and helps improve your user experience. You can save your choice
        in a file called 'bayesdb-session-capture-opt.txt' in the directory
        where you run the software, or any parent directory. This option
        overrides any setting in such a file. Any string is interpreted as
        opting in to sending session details. False is interpreted as opting
        out. You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us connect
        your sessions to you.

        If you encounter a bug, or something surprising, please include your
        session capture name in your report.

        If you opt out, you still allow us to count how often users opt out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY IDENTIFIABLE,
        OR SIMILARLY SENSITIVE DATA! Opting out does not guarantee security.
    """
    Population.method_imports()
    assert re.match(r'\w+', name)
    assert df is not None or csv_path or bdb_path
    self.name = name
    self.generator_name = name + '_cc' # Because we use the default metamodel.
    self.csv_path = csv_path
    self.df = df
    self.bdb_path = bdb_path
    if logger is None:
      if 'IPython' in sys.modules:
        from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
        self.logger = ipy
      else:
        self.logger = BqlLogger()
    else:
      self.logger = logger
    self.bdb = None
    self.status = None
    self.session_capture_name = None
    self.generators = []
    with logged_query('count-beacon', None, name='count-beacon'):
      self.initialize_session_capture(session_capture_name)
    self.initialize()

  def initialize_session_capture(self, name):
    if self.session_capture_name is not None:
      return
    if name is not None:
      self.session_capture_name = name
      return
    # Search for a session-capture name or opt-out saved as a file:
    searchdir = os.getcwd()
    while searchdir != os.path.dirname(searchdir):  # While not at root.
      try:
        with open(os.path.join(searchdir, OPTFILE), 'r') as optinfile:
          self.session_capture_name = optinfile.read()
          if self.session_capture_name == 'False':
            self.session_capture_name = False
          break
      except IOError:
        pass
      searchdir = os.path.dirname(searchdir)
    # No init option specified, no choice file found. Force the choice.
    if self.session_capture_name is None:
      raise BLE(
        "Please set session_capture_name option to Population.__init__\n"
        "  to either opt-in or opt-out of sending details of your usage of\n"
        "  this software to the MIT Probabilistic Computing Group.\n\n"
        "If you see this in one of our example notebooks,\n"
        "  return to the starting page, the Index.ipynb, to\n"
        "  make that choice.")

  def initialize(self):
    if self.bdb:
      self.check_representation()
      return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
      if self.df is not None:
        bayeslite.read_pandas.bayesdb_read_pandas_df(
          self.bdb, self.name, self.df, create=True, ifnotexists=True)
      elif self.csv_path:
        bayeslite.bayesdb_read_csv_file(
          self.bdb, self.name, self.csv_path,
          header=True, create=True, ifnotexists=True)
      else:
        tables = self.list_tables()
        metamodels = self.list_metamodels()
        if len(tables) + len(metamodels) == 0:
          raise BLE(ValueError("No data sources specified, and an empty bdb."))
        else:
          raise BLE(ValueError("The name of the population must be the same"
                               " as a table in the bdb, one of: " +
                               ", ".join(tables) +
                               "\nNote also that the bdb has the following"
                               " metamodels defined: " + ", ".join(metamodels)))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
      size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
      assert 0 < size
      self.query('''
        CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
    self.check_representation()

  def check_representation(self):
    assert self.bdb, "Did you initialize?"
    assert self.session_capture_name is not None

  def interpret_bql(self, query_string):
    '''Replace %t and %g as appropriate.'''
    return re.sub(r'(^|(?<=\s))%t\b',
                  bayeslite.bql_quote_name(self.name),
                  re.sub(r'(^|(?<=\s))%g\b',
                         bayeslite.bql_quote_name(self.generator_name),
                         query_string))

  def sql_tracing(self, turn_on=True):
    """Trace underlying SQL, for debugging."""
    self.check_representation()
    # Always turn off:
    self.bdb.sql_untrace(self.bdb.sql_tracer)
    if turn_on:
      printer = lambda query, bindings: self.logger.info(
          "query: [%s] bindings: [%s]\n\n", query, bindings)
      self.bdb.sql_trace(printer)

  def reset(self):
    self.check_representation()
    self.query('drop generator if exists %s' % self.generator_name)
    self.query('drop table if exists %s' % self.name)
    self.bdb = None
    self.initialize()

  def specifier_to_df(self, spec):
    if isinstance(spec, pd.DataFrame):
      return spec
    else:
      return self.query(spec)

  def help(self, filter=None):
    """Show a short menu of available methods.

    filter : string or re
        Show only methods whose descriptions match the given pattern.
    """
    response = self.shortdocs
    if filter is not None:
      if hasattr(filter, '__call__'):
        response = [r for r in response if filter(r)]
      else:
        response = [r for r in response if re.search(filter, r)]
    print '\n'.join(response)
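
The %t/%g rewriting in interpret_bql applies only at word boundaries, so substrings such as '%table%' are left alone. A standalone sketch of the same substitution, with quote_name standing in for bayeslite.bql_quote_name:

# Standalone sketch; quote_name approximates bayeslite.bql_quote_name by
# double-quoting the identifier.
import re

def quote_name(name):
    return '"%s"' % name.replace('"', '""')

def interpret_bql(query_string, table, generator):
    return re.sub(r'(^|(?<=\s))%t\b', quote_name(table),
                  re.sub(r'(^|(?<=\s))%g\b', quote_name(generator),
                         query_string))

print interpret_bql('SELECT COUNT(*) FROM %t', 'people', 'people_cc')
# -> SELECT COUNT(*) FROM "people"
print interpret_bql('ESTIMATE * FROM %g LIMIT 5', 'people', 'people_cc')
# -> ESTIMATE * FROM "people_cc" LIMIT 5
print interpret_bql("... LIKE '%table%'", 'people', 'people_cc')
# -> unchanged: %t inside '%table%' is not at a word boundary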