def initialize_session_capture(self, name):
    """Settle the session-capture choice for this object.

    The first source that provides a value wins:

    1. A value already stored in ``self.session_capture_name``; in that
       case nothing is changed.
    2. The ``name`` argument, when it is not None.  ``False`` means
       opt-out, and a 'count-beacon' logged_query named 'single-opt-out'
       is issued before the choice is stored.
    3. The contents of a file called 'bayesdb-session-capture-opt.txt',
       looked for in the current working directory and then in each
       parent directory.  Contents of 'False' mean opt-out, recorded
       under a 'count-beacon' logged_query named 'saved-opt-out'.

    Whitespace around the file contents is ignored, so a file that an
    editor or ``echo`` terminated with a newline is still recognized as
    'False' (opt-out) instead of being taken as an opt-in name.

    name : str, False, or None
        The choice given at construction time, or None to fall back to
        the saved file.

    Raises BLE(ValueError) when none of the sources provides a choice.
    """
    if self.session_capture_name is not None:
        return
    if name is not None:
        # Deliberately `==` rather than `is`: kept as the original
        # comparison so that existing callers see no change.
        if name == False:  # noqa: E712
            with logged_query('count-beacon', None, name='single-opt-out'):
                pass
        self.session_capture_name = name
        return
    # Search for a session-capture name or opt-out saved as a file:
    filename = "bayesdb-session-capture-opt.txt"
    searchdir = os.getcwd()
    # The loop stops once searchdir is its own parent, so the filesystem
    # root itself is not searched.
    while searchdir != os.path.dirname(searchdir):  # While not at root.
        try:
            with open(os.path.join(searchdir, filename), 'r') as optinfile:
                # Strip so that a trailing newline cannot turn a saved
                # opt-out into an opt-in under the name "False\n".
                saved_choice = optinfile.read().strip()
                self.session_capture_name = saved_choice
                if saved_choice == 'False':
                    with logged_query('count-beacon', None,
                                      name='saved-opt-out'):
                        self.session_capture_name = False
                break
        except IOError:
            # No readable choice file at this level; try the parent.
            pass
        searchdir = os.path.dirname(searchdir)
    # No init option specified, no choice file found. Force the choice.
    if self.session_capture_name is None:
        raise BLE(ValueError(
            "Please set session_capture_name option to quickstart\n"
            " to either opt-in or opt-out of sending details of your usage of\n"
            " this software to the MIT Probabilistic Computing Group.\n\n"
            "If you see this in one of our example notebooks,\n"
            " return to the starting page, the Index.ipynb, to\n"
            " make that choice."))
    # TODO: Index.ipynb is a promise.
def analyze(self, models=100, minutes=0, iterations=0, checkpoint=0):
    '''Run analysis on this population's generator.

    models : integer
        How many models to make sure exist before analyzing.  The number
        of models bounds the accuracy of predictive probabilities: ten
        models give about one decimal digit of interpretability, a
        hundred give two, and so on.  If not positive, no models are
        initialized and the sum of the current analysis status is used
        in its place when choosing a default checkpoint.
    minutes : integer
        How long to let analysis run.
    iterations : integer
        How many iterations to let analysis run.
    checkpoint : integer
        Checkpoint interval in iterations.  0 (the default) derives one
        from minutes and models, or from iterations.

    Returns:
        A report indicating how many models have seen how many
        iterations, and other info about model stability.

    Raises BLE(NotImplementedError) if neither minutes nor iterations
    is positive.
    '''
    self.check_representation()
    with logged_query(query_string='recipes.analyze',
                      name=self.session_capture_name):
        if not models > 0:
            models = self.analysis_status().sum()
        else:
            self.query('INITIALIZE %d MODELS IF NOT EXISTS FOR %s'
                       % (models, self.generator_name))
            assert minutes == 0 or iterations == 0

        if minutes > 0:
            interval = (max(1, int(minutes * models / 200))
                        if checkpoint == 0 else checkpoint)
            timed_bql = (
                'ANALYZE %s FOR %d MINUTES CHECKPOINT %d ITERATION WAIT'
                % (self.generator_name, minutes, interval))
            # The timed run is additionally logged on its own, with the
            # table contents as its bindings.
            with logged_query(query_string=timed_bql,
                              name=self.session_capture_name,
                              bindings=self.query('SELECT * FROM %t')):
                self.query(timed_bql)
        elif iterations > 0:
            interval = (max(1, int(iterations / 20))
                        if checkpoint == 0 else checkpoint)
            counted_bql = (
                'ANALYZE %s FOR %d ITERATIONS CHECKPOINT %d ITERATION WAIT'
                % (self.generator_name, iterations, interval))
            self.query(counted_bql)
        else:
            raise BLE(NotImplementedError(
                'No default analysis strategy yet.'
                ' Please specify minutes or iterations.'))

        # itrs = self.per_model_analysis_status()
        # models_with_fewest_iterations =
        #    itrs[itrs['iterations'] == itrs.min('index').head(0)[0]].index.tolist()
        # TODO(gremio): run each model with as many iterations as it needs
        # to get up to where it needs to get to, if that's larger?
        # Nope. Vikash said there's no reason to think that's a good idea.
        # Perhaps even better to have some young models mixed in with the
        # old ones.
        # I still think we should make some recommendation that scales for
        # what "the right thing" is, where that's something that at least
        # isn't known to suck.
        return self.analysis_status()
def test_logged_query_dataframe():
    """A DataFrame passed as a binding is posted as its JSON rendering."""
    from pandas import DataFrame
    df = DataFrame({
        'a': [1, 2.3, -4],  # complex(4, -5)],
        # Complex is broken, even with the default handler special case.
        # See https://github.com/pydata/pandas/issues/12554
        'b': [float('nan'), None, 'N/A'],
    })
    query_stub = StubCallable()
    post_stub = StubCallable()
    lgr = loggers.CallHomeStatusLogger(post=post_stub)
    with loggers.logged_query(logger=lgr, query_string='q', bindings=(df,),
                              name='n'):
        query_stub('inside')
    assert 1 == len(query_stub.calls)
    assert "(('inside',), {})" == str(query_stub.calls[0])
    time.sleep(0.2)  # To let the call-home thread run, so this is less flaky.
    assert 1 == len(post_stub.calls)
    posted = post_stub.calls[0][1]['data']['session_json']
    data = json.loads(posted)
    df = data['entries'][0][2][1]
    # The expected text contains a literal backslash before the slash.
    # It is spelled "\\/" because "\/" is not a valid escape sequence and
    # newer Python versions warn about it; the runtime value is the same.
    assert [
        '{"a":{"0":1.0,"1":2.3,"2":-4.0},'
        # {"mathjs":"Complex","re":4,"im":-5}},' Complex broken. See above.
        '"b":{"0":null,"1":null,"2":"N\\/A"}}'
    ] == df
def test_logged_query_no_name():
    """Without a name, a logged query must not post anything."""
    recorder = StubCallable()
    status_logger = loggers.CallHomeStatusLogger(post=recorder)
    query_context = loggers.logged_query(
        query_string='q', bindings=('b', ), logger=status_logger)
    with query_context:
        recorder('inside')
    # Just the one call: nothing posted.
    seen = str(recorder.calls)
    assert seen == "[(('inside',), {})]"
def test_logged_query_no_name():
    """A logged query with no name leaves the post callable untouched."""
    shared_stub = StubCallable()
    call_home = loggers.CallHomeStatusLogger(post=shared_stub)
    unnamed = dict(query_string='q', bindings=('b',), logger=call_home)
    with loggers.logged_query(**unnamed):
        shared_stub('inside')
    # Just the one call: nothing posted.
    expected_calls = "[(('inside',), {})]"
    assert expected_calls == str(shared_stub.calls)
def analyze(self, models=100, minutes=0, iterations=0, checkpoint=0,
            generator_name=None):
    '''Run analysis on the named generator.

    models : integer
        How many models to make sure exist before analyzing.  The number
        of models bounds the accuracy of predictive probabilities: ten
        models give about one decimal digit of interpretability, a
        hundred give two, and so on.  If not positive, no models are
        initialized and the sum of the current analysis status is used
        in its place when choosing a default checkpoint.
    minutes : integer
        How long to let analysis run.
    iterations : integer
        How many iterations to let analysis run.
    checkpoint : integer
        Checkpoint interval in iterations.  0 (the default) derives one
        from minutes and models, or from iterations.
    generator_name : string
        REQUIRED.  The generator to initialize and analyze.

    Returns:
        A report indicating how many models have seen how many
        iterations, and other info about model stability.

    Raises NotImplementedError if neither minutes nor iterations is
    positive.
    '''
    assert generator_name is not None
    if not models > 0:
        models = self.analysis_status(generator_name=generator_name).sum()
    else:
        self.query('INITIALIZE %d MODELS IF NOT EXISTS FOR %s'
                   % (models, generator_name))
        assert minutes == 0 or iterations == 0

    if minutes > 0:
        interval = (max(1, int(minutes * models / 200))
                    if checkpoint == 0 else checkpoint)
        timed_bql = (
            'ANALYZE %s FOR %d MINUTES CHECKPOINT %d ITERATION WAIT'
            % (generator_name, minutes, interval))
        # Only the timed run is logged, with the table contents as its
        # bindings.
        with logged_query(query_string=timed_bql,
                          name=self.session_capture_name,
                          bindings=self.query('SELECT * FROM %t')):
            self.query(timed_bql)
    elif iterations > 0:
        interval = (max(1, int(iterations / 20))
                    if checkpoint == 0 else checkpoint)
        counted_bql = (
            'ANALYZE %s FOR %d ITERATIONS CHECKPOINT %d ITERATION WAIT'
            % (generator_name, iterations, interval))
        self.query(counted_bql)
    else:
        raise NotImplementedError('No default analysis strategy yet. '
                                  'Please specify minutes or iterations.')

    # itrs = self.per_model_analysis_status()
    # models_with_fewest_iterations =
    #    itrs[itrs['iterations'] == itrs.min('index').head(0)[0]].index.tolist()
    # TODO(gremio): run each model with as many iterations as it needs to
    # get up to where it needs to get to, if that's larger?
    # Nope. Vikash said there's no reason to think that's a good idea.
    # Perhaps even better to have some young models mixed in with the old
    # ones.
    # I still think we should make some recommendation that scales for what
    # "the right thing" is, where that's something that at least isn't known
    # to suck.
    return self.analysis_status(generator_name=generator_name)
def test_logged_query_success():
    """A successful query runs once and is posted once."""
    body_stub = StubCallable()
    poster = StubCallable()
    status_logger = loggers.CallHomeStatusLogger(post=poster)
    with loggers.logged_query(logger=status_logger, **THE_USUAL):
        body_stub('inside')
    body_calls = body_stub.calls
    assert len(body_calls) == 1
    assert str(body_calls[0]) == "(('inside',), {})"
    time.sleep(0.2)  # To let the call-home thread run, so this is less flaky.
    assert len(poster.calls) == 1
    check_logcall(poster.calls[0])
def test_logged_query_successful_log_failure():
    """A failing post must not disturb a query that itself succeeds."""
    body_stub = StubCallable()
    broken_poster = StubCallable(throw=NotImplementedError('foo'))
    status_logger = loggers.CallHomeStatusLogger(post=broken_poster)
    with loggers.logged_query(logger=status_logger, **THE_USUAL):
        body_stub('inside')
    assert len(body_stub.calls) == 1
    assert str(body_stub.calls) == "[(('inside',), {})]"
    # There will have been a failure on another thread,
    # and it will have been ignored.
    time.sleep(0.2)  # To let the call-home thread run, so this is less flaky.
    attempted_posts = broken_poster.calls
    assert len(attempted_posts) == 1
    check_logcall(attempted_posts[0])
def as_population_method(self, *args, **kwargs):
    # Wrapper around `fn` (taken from the enclosing scope): logs the call
    # under fn's code name, checks the representation before and after,
    # and passes the arguments through apply_argspec_transforms first.
    with logged_query(query_string=fn.__code__.co_name,
                      bindings=(args, kwargs),
                      name=self.session_capture_name):
        self.check_representation()
        transformed = apply_argspec_transforms(self, xfrms, args, kwargs)
        (call_args, call_kwargs) = transformed
        try:
            outcome = fn(*call_args, **call_kwargs)
        except:
            # Record the traceback through our logger, then let the
            # exception continue to the caller unchanged.
            self.logger.exception("")
            raise
        self.check_representation()
        return outcome
def test_logged_query_fail():
    """A query that raises is still posted, and its exception propagates."""
    exploding = StubCallable(throw=NotImplementedError('foo'))
    poster = StubCallable()
    status_logger = loggers.CallHomeStatusLogger(post=poster)
    try:
        with loggers.logged_query(logger=status_logger, **THE_USUAL):
            exploding('die')
    except NotImplementedError:
        pass
    else:
        # Reaching here means the exception was swallowed.
        assert False
    time.sleep(0.2)  # To let the call-home thread run, so this is less flaky.
    assert len(poster.calls) == 1
    check_logcall(poster.calls[0])
    assert len(exploding.calls) == 1
    assert str(exploding.calls) == "[(('die',), {})]"
def quick_explore_vars(self, vars, nsimilar=20, plotfile='explore_vars'):
    """Show dependence probabilities, and neighborhoods based on them.

    First pair-plots the given variables and shows a heatmap of their
    pairwise dependence probabilities.  Then, for each variable, finds
    the columns most probably dependent on it and shows a heatmap of
    the pairwise dependence probabilities within that neighborhood.

    vars : list of strings
        At least two column names to look at dependence probabilities
        of, and to explore neighborhoods of.
    nsimilar : positive integer
        The size of the neighborhood to explore.
    plotfile : string pathname
        Where to save plots, if not displaying them on console.
        Neighborhood plots use this name followed by "-" and the column.

    Raises BLE(ValueError) if fewer than two variables are given.
    """
    self.check_representation()

    def pairwise_dependence(column_list):
        # Pairwise dependence probabilities for an already-quoted,
        # comma-separated list of columns, with normalized headers.
        frame = self.query(
            'ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF %s'
            ' FOR %s;' % (self.generator_name, column_list))
        frame.columns = ['genid', 'name0', 'name1', 'value']
        return frame

    with logged_query(query_string='quick_explore_vars', bindings=(vars,),
                      name=self.session_capture_name):
        if len(vars) < 2:
            raise BLE(ValueError('Need to explore at least two variables.'))
        self.pairplot_vars(vars)

        query_columns = '"%s"' % '", "'.join(vars)
        deps = pairwise_dependence(query_columns)
        self.heatmap(deps, plotfile=plotfile)
        deps.columns = ['genid', 'name0', 'name1', 'value']
        # Keep each unordered pair once, strongest dependence first.
        triangle = deps[deps['name0'] < deps['name1']].sort_values(
            ascending=False, by=['value'])
        self.logger.result(
            "Pairwise dependence probability for: %s\n%s\n\n",
            query_columns, triangle)

        for col in vars:
            neighborhood = self.query(
                'ESTIMATE *, DEPENDENCE PROBABILITY WITH "%s"'
                ' AS "Probability of Dependence with %s"'
                ' FROM COLUMNS OF %s'
                ' ORDER BY "Probability of Dependence with %s"'
                ' DESC LIMIT %d;'
                % (col, col, self.generator_name, col, nsimilar))
            neighbor_names = neighborhood["name"].tolist()
            neighbor_columns = '"%s"' % '", "'.join(neighbor_names)
            deps = pairwise_dependence(neighbor_columns)
            self.heatmap(deps, plotfile=(plotfile + "-" + col))
            self.logger.result(
                "Pairwise dependence probability of %s with its "
                "strongest dependents:\n%s\n\n",
                col, neighborhood)
def test_logged_query_reporting_timeout():
    """A slow post must not hold up the query being logged."""
    body_stub = StubCallable()
    post_delay = 1
    sluggish_poster = StubCallable(sleep=post_delay)
    began = time.time()
    status_logger = loggers.CallHomeStatusLogger(post=sluggish_poster)
    with loggers.logged_query(logger=status_logger, **THE_USUAL):
        body_stub('inside')
    took = time.time() - began
    time.sleep(0.2)  # To let the call-home thread run, so this is less flaky.
    # Success after done.
    assert len(body_stub.calls) == 1
    assert str(body_stub.calls[0]) == "(('inside',), {})"
    assert took < .9 * post_delay  # Shouldn't even be close.
    # But the call should have registered already:
    assert len(sluggish_poster.calls) == 1
    check_logcall(sluggish_poster.calls[0])
def pairplot(self, cols, plotfile=None, colorby=None, **kwargs):
    """Wrap bdbcontrib.plot_utils.pairplot to show the given columns.

    Specifies bdb, query with the given columns, and generator_name:
    bdbcontrib_pairplot
    """
    # NOTE(review): the trailing `bdbcontrib_pairplot` token in the
    # docstring looks like a placeholder that is substituted elsewhere
    # with the wrapped function's help; it is kept verbatim -- confirm
    # before rewording.
    #
    # cols: the column names to plot; at least one is required
    #   (ValueError otherwise).
    # plotfile: handed to self.logger.plot along with the figure.
    # colorby: optional column name; it is added to the selected columns
    #   if not already among them, and passed on to the plotter.
    # kwargs: passed through to bdbcontrib.pairplot.
    if len(cols) < 1:
        raise ValueError('Pairplot at least one variable.')
    if colorby is None:
        qcols = cols
    else:
        # De-duplicate while keeping the caller's column order.  A set
        # here would select the columns in arbitrary order.
        qcols = []
        for column in list(cols) + [colorby]:
            if column not in qcols:
                qcols.append(column)
    query_columns = '''"%s"''' % '''", "'''.join(qcols)
    with logged_query(query_string='pairplot cols=?',
                      bindings=(query_columns,),
                      name=self.session_capture_name):
        self.logger.plot(
            plotfile,
            bdbcontrib.pairplot(
                self.bdb,
                '''SELECT %s FROM %s''' % (query_columns, self.name),
                generator_name=self.generator_name,
                colorby=colorby,
                **kwargs))
def heatmap(self, deps, selectors=None, plotfile=None, **kwargs):
    '''Show heatmaps for the given dependencies

    Parameters
    ----------
    deps : pandas.DataFrame(columns=['generator_id', 'name0', 'name1', 'value'])
        The result of a .q('ESTIMATE ... PAIRWISE ...')
        E.g., DEPENDENCE PROBABILITY, MUTUAL INFORMATION, COVARIANCE, etc.

    selectors : {str: lambda name --> bool}
        Rather than plot the full NxN matrix all together, make separate
        plots for each combination of these selectors, plotting them in
        sequence.  The dict's values are the selector functions; its keys
        are their names, used for plot legends and filenames.  E.g.,
        {'A-E': lambda x: bool(re.search(r'^[a-eA-E]', x[0])),
         'F-O': lambda x: bool(re.search(r'^[f-oF-O]', x[0])),
         'P-Z': lambda x: bool(re.search(r'^[p-zP-Z]', x[0]))}

    plotfile : str
        If a plotfile is specified, savefig to that file. If selectors
        are also specified, savefig to name1.name2.plotfile.

    **kwargs : dict
        Passed to zmatrix: vmin, vmax, row_ordering, col_ordering

    Returns the figure created with plt.figure() before plotting.
    '''
    self.check_representation()
    with logged_query(query_string='heatmap(deps, selectors)',
                      bindings=(str(deps), repr(selectors)),
                      name=self.session_capture_name):
        hmap = plt.figure()
        if selectors is not None:
            # Selector functions in order of their names, plus a lookup
            # from each function back to its name for the filenames.
            ordered_fns = [selectors[label] for label in sorted(selectors)]
            label_of = {fn: label for (label, fn) in selectors.items()}
            submaps = bdbcontrib.plot_utils.selected_heatmaps(
                self.bdb, df=deps, selectors=ordered_fns, **kwargs)
            for (cmap, row_fn, col_fn) in submaps:
                target = "%s.%s.%s" % (
                    label_of[row_fn], label_of[col_fn], plotfile)
                self.logger.plot(target, cmap)
        else:
            whole = bdbcontrib.plot_utils.heatmap(
                self.bdb, df=deps, **kwargs)
            self.logger.plot(plotfile, whole)
        return hmap
def as_population_method(self, *args, **kwargs):
    # Wrapper around `fn` (taken from the enclosing scope): logs the call
    # under fn's code name, checks the representation before and after,
    # passes the arguments through apply_argspec_transforms, and reports
    # the result through self.logger (as a plot if it is a Figure, as an
    # info message otherwise) before returning it.
    with logged_query(query_string=fn.__code__.co_name,
                      bindings=(args, kwargs),
                      name=self.session_capture_name):
        self.check_representation()
        (dargs, dkwargs) = apply_argspec_transforms(self, xfrms, args, kwargs)
        try:
            result = fn(*dargs, **dkwargs)
        except:
            # Record the traceback through our logger, then let the
            # exception continue to the caller unchanged.
            self.logger.exception("")
            raise
        from matplotlib import pyplot
        if isinstance(result, pyplot.Figure):
            # logger.plot takes the plot file name first and the figure
            # second, as in the other logger.plot calls in this code;
            # the arguments used to be passed the other way around.
            self.logger.plot(kwargs.get('plotfile', None), result)
        else:
            self.logger.info(result)
        self.check_representation()
        return result
def quick_similar_rows(self, identify_row_by, nsimilar=10):
    """Explore rows similar to the identified one.

    The similarities are estimated once into a temporary table whose
    name is derived from the arguments and the current status, and the
    contents of that table are returned, most similar first.

    identify_row_by : dict
        Dictionary of column names to their values. These will be turned
        into a WHERE clause in BQL, and must identify one unique row.
    nsimilar : positive integer
        The number of similar rows to retrieve.

    Returns the result of querying the temporary table, ordered by
    descending similarity.

    Raises BLE(NotImplementedError) if identify_row_by does not match
    exactly one row.
    """
    self.check_representation()
    with logged_query(query_string='quick_similar_rows(id_by, n)',
                      bindings=(identify_row_by, nsimilar),
                      name=self.session_capture_name):
        import hashlib
        # nsimilar is part of the key: the table is only created IF NOT
        # EXISTS, so without it a later call asking for a different
        # number of rows would silently reuse the earlier table.
        table_name = 'tmptbl_' + hashlib.md5('\x00'.join(
            [repr(identify_row_by), str(self.status),
             str(nsimilar)])).hexdigest()
        column_name = 'similarity_to_' + "__".join(
            re.sub(r'\W', '_', str(val))
            for val in identify_row_by.values())
        query_params = []
        query_columns = []
        for k, v in identify_row_by.iteritems():
            # Values travel as bindings; only the quoted column name is
            # interpolated into the query text.
            query_columns.append('''%s = ? ''' % bayeslite.bql_quote_name(k))
            query_params.append(v)
        query_attrs = ' and '.join(query_columns)
        with self.bdb.savepoint():
            # The WHERE clause contains "?" placeholders, so this query
            # needs the bindings just like the creation query below.
            row_exists = self.query(
                'SELECT COUNT(*) FROM %s WHERE %s;'
                % (self.name, query_attrs),
                query_params)
            # .iloc replaces .ix, which newer pandas no longer provides.
            nmatches = row_exists.iloc[0, 0]
            if nmatches != 1:
                raise BLE(NotImplementedError(
                    'identify_row_by found %d rows instead of exactly 1 in %s.'
                    % (nmatches, self.csv_path)))
            # ORDER BY before LIMIT, so that the rows kept are the most
            # similar ones rather than whichever rows come first.
            creation_query = (
                '''CREATE TEMP TABLE IF NOT EXISTS %s AS ESTIMATE *, SIMILARITY TO (%s) AS %s FROM %%g ORDER BY %s DESC LIMIT %d;'''
                % (table_name, query_attrs, column_name, column_name,
                   nsimilar))
            self.query(creation_query, query_params)
            result = self.query(
                '''SELECT * FROM %s ORDER BY %s DESC;'''
                % (table_name, column_name))
        return result
def test_logged_query_dataframe():
    """A DataFrame passed as a binding is posted as its JSON rendering."""
    from pandas import DataFrame
    df = DataFrame({'a': [1, 2.3, -4],  # complex(4, -5)],
                    # Complex is broken, even with the default
                    # handler special case.
                    # See https://github.com/pydata/pandas/issues/12554
                    'b': [float('nan'), None, 'N/A']})
    query_stub = StubCallable()
    post_stub = StubCallable()
    lgr = loggers.CallHomeStatusLogger(post=post_stub)
    with loggers.logged_query(logger=lgr, query_string='q', bindings=(df,),
                              name='n'):
        query_stub('inside')
    assert 1 == len(query_stub.calls)
    assert "(('inside',), {})" == str(query_stub.calls[0])
    time.sleep(0.2)  # To let the call-home thread run, so this is less flaky.
    assert 1 == len(post_stub.calls)
    posted = post_stub.calls[0][1]['data']['session_json']
    data = json.loads(posted)
    df = data['entries'][0][2][1]
    # The second fragment is a raw string: it contains a literal
    # backslash before the slash, and "\/" in a plain string literal is
    # an invalid escape sequence that newer Python versions warn about.
    # The runtime value is unchanged.
    assert ['{"a":{"0":1.0,"1":2.3,"2":-4.0},'
            # {"mathjs":"Complex","re":4,"im":-5}},' Complex broken. See above.
            r'"b":{"0":null,"1":null,"2":"N\/A"}}'] == df
def __init__(self, name, csv_path=None, bdb_path=None, df=None, logger=None,
             session_capture_name=None):
    """Create a Population object, wrapping a bayeslite.BayesDB.

    name : str
        REQUIRED. A name for the population, should use letters and
        underscores only.  This will also be used as a table name in the
        bdb, and %t in queries will expand to this name. %g in queries
        will expand to the current population metamodel, also based on
        this name.
    csv_path : str
        The path to a comma-separated values file. If specified, will be
        used to populate the bdb. It must exist and be both readable and
        non-empty.
    df : pandas.DataFrame
        If specified, these data will be used to populate the bdb,
        superseding any csv_path. It must not be empty.
    bdb_path : str
        If specified, store data and analysis results here. If no other
        data source (csv or df) is specified, then it must already have
        been populated. If not specified, we will use a volatile
        in-memory bdb.
    logger : object
        Something on which we can call .info or .warn to send messages
        to the user. By default a bayeslite.loggers.BqlLogger, but could
        be QuietLogger (only results), SilentLogger (nothing), IpyLogger,
        CaptureLogger, LoggingLogger, or anything else that implements
        the BqlLogger interface.
    session_capture_name : String
        Signing up with your name and email and sending your session
        details to the MIT Probabilistic Computing Group helps build a
        community of support and helps improve your user experience. You
        can save your choice in a file called
        'bayesdb-session-capture-opt.txt' in the directory where you run
        the software, or any parent directory. This option overrides any
        setting in such a file. Any string is interpreted as opting in
        to sending session details. False is interpreted as opting out.
        You must choose. If you choose to use an organization name or
        email, then please send a note to [email protected] to help us
        connect your sessions to you.

        If you encounter a bug, or something surprising, please include
        your session capture name in your report.

        If you opt out, you still allow us to count how often users opt
        out.

        DO NOT USE THIS SOFTWARE FOR HIPAA-COVERED, PERSONALLY
        IDENTIFIABLE, OR SIMILARLY SENSITIVE DATA! Opting out does not
        guarantee security.
    """
    Population.method_imports()
    # The whole name must consist of word characters: it is interpolated
    # into queries as a table name.  (An unanchored match would only
    # check that the name starts with one.)
    assert re.match(r'\w+\Z', name)
    # At least one source of data is needed.
    assert df is not None or csv_path or bdb_path
    self.name = name
    self.generator_name = name + '_cc'  # Because we use the default metamodel.
    self.csv_path = csv_path
    self.df = df
    self.bdb_path = bdb_path
    if logger is None:
        # With no logger given, use the IPython logger when IPython has
        # been loaded, and a plain BqlLogger otherwise.
        if 'IPython' in sys.modules:
            from bdbcontrib.loggers import IPYTHON_LOGGER as ipy
            self.logger = ipy
        else:
            self.logger = BqlLogger()
    else:
        self.logger = logger
    self.bdb = None
    self.status = None
    self.session_capture_name = None
    self.generators = []
    # The beacon is issued under a fixed name, whatever choice is made.
    with logged_query('count-beacon', None, name='count-beacon'):
        self.initialize_session_capture(session_capture_name)
    self.initialize()
def q(self, query_string, *bindings):
    '''help_for_query'''
    # NOTE(review): the docstring above looks like a placeholder that is
    # substituted elsewhere with the help for query; it is kept verbatim
    # -- confirm before rewording.
    #
    # Same as self.query, with the call recorded under this object's
    # session capture name.
    capture_name = self.session_capture_name
    with logged_query(query_string, bindings, name=capture_name):
        answer = self.query(query_string, *bindings)
        return answer