def train(self, df, targets, conditions):
    """Validate the schema and learn the noise scale of the period model.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the target and both condition columns.
    targets : list of (column_name, stattype) pairs
        Exactly one pair whose stattype is 'numerical' (case-insensitive).
    conditions : list of (column_name, stattype) pairs
        Exactly two pairs, both 'numerical'.  The two columns are passed,
        in the order given, to `satellite_period_minutes`.

    Raises
    ------
    BLE wrapping ValueError
        If the number or statistical type of targets/conditions is wrong.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        raise BLE(
            ValueError('OrbitalMechanics can only target one '
                       'column. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(
            ValueError('OrbitalMechanics can only target a NUMERICAL '
                       'column. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the conditions column.
    if len(conditions) != 2:
        raise BLE(
            ValueError('OrbitalMechanics can only condition on '
                       'two columns. Received {}'.format(conditions)))
    if any(c[1].lower() != 'numerical' for c in conditions):
        raise BLE(
            ValueError(
                'OrbitalMechanics can only condition on '
                'NUMERICAL columns. Received {}'.format(conditions)))
    self.conditions = [c[0] for c in conditions]

    # The dataset: only the relevant columns, rows with missing values
    # dropped.
    self.dataset = df[self.conditions + self.targets].dropna()
    # `.values` replaces the removed `DataFrame.as_matrix()`.
    X = self.dataset[self.conditions].values

    # Learn the noise model.
    actual_period = self.dataset[self.targets].values.ravel()
    theoretical_period = satellite_period_minutes(X[:, 0], X[:, 1])
    errors = np.abs(actual_period - theoretical_period)
    error_95 = np.percentile(errors, 95)
    # np.select's default fill is 0, so errors at or above the 95th
    # percentile contribute 0 to the mean; `errors` becomes a scalar here.
    errors = np.mean(np.select([errors < error_95], [errors]))
    self.noise = np.sqrt(np.mean(errors**2))
def gen_hilight_colors(hl_labels=None, hl_colors=None):
    """Generates a hilight color lookup from labels to colors.

    Generates colors from Set1 by default.

    Parameters
    ----------
    hl_labels : list, optional
        Labels to hilight.  If None, an empty dict is returned.
    hl_colors : list or dict, optional
        Either a list of colors parallel to `hl_labels`, or a dict mapping
        each label to a color.  If None, colors are drawn from
        `plt.cm.Set1`.

    Returns
    -------
    dict
        Mapping of label to color.

    Raises
    ------
    BLE wrapping ValueError
        If `hl_colors` has a key that is not in `hl_labels`, or does not
        have exactly one entry per label.
    BLE wrapping TypeError
        If `hl_colors` is neither a list nor a dict.
    """
    if hl_labels is None:
        return {}

    if hl_colors is None:
        hl_colors_out = {}
        if len(hl_labels) > 0:
            hl_max = float(len(hl_labels))
            # Spread colors by the label's *position*; the labels
            # themselves may be strings and cannot be divided.
            hl_colors_out = dict(
                (label, plt.cm.Set1(idx / hl_max))
                for idx, label in enumerate(hl_labels))
        return hl_colors_out

    if isinstance(hl_colors, list):
        hl_colors_out = dict(zip(hl_labels, hl_colors))
    elif isinstance(hl_colors, dict):
        for key in hl_colors:
            if key not in hl_labels:
                raise BLE(ValueError(
                    'hl_colors dict must have an entry '
                    'for each hl_label.'))
        hl_colors_out = hl_colors
    else:
        raise BLE(TypeError('hl_colors must be a list or dict.'))

    if len(hl_colors) != len(hl_labels):
        raise BLE(ValueError('hl_colors must have an entry for each '
                             'entry in hl_labels.'))
    return hl_colors_out
def train(self, df, targets, conditions):
    """Validate the schema, build the training matrices and fit the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    targets : list of (column_name, stattype) pairs
        Exactly one pair whose stattype is 'numerical' (case-insensitive).
    conditions : list of (column_name, stattype) pairs
        At least one pair.  Columns whose stattype is 'categorical' are
        treated as categorical; every other column as numerical.

    Raises
    ------
    BLE wrapping ValueError
        If the targets or conditions do not satisfy the above.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # The check is for exactly one target, so say so.
        raise BLE(
            ValueError('MultipleRegression requires exactly one column '
                       'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(
            ValueError('MultipleRegression can only regress NUMERICAL '
                       'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(
            ValueError(
                'MultipleRegression requires at least one '
                'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Numerical columns first, then categorical ones.
    self.conditions = self.conditions_numerical + \
        self.conditions_categorical

    # Placeholders, so the attributes exist even if preprocessing fails.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels)
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                 self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Check the schema, assemble the training arrays and fit the forests.

    `targets` must hold exactly one (name, stattype) pair with a
    'categorical' stattype; `conditions` must hold at least one pair.
    Condition columns tagged 'categorical' are handled as categorical and
    all others as numerical.  Violations raise BLE wrapping ValueError.
    """
    # --- target column ---
    if len(targets) != 1:
        raise BLE(
            ValueError('RandomForest requires exactly one column in '
                       'targets. Received {}'.format(targets)))
    target_name, target_type = targets[0][0], targets[0][1]
    if target_type.lower() != 'categorical':
        raise BLE(
            ValueError('RandomForest can only classify CATEGORICAL '
                       'columns. Received {}'.format(targets)))
    self.targets = [target_name]

    # --- condition columns ---
    if len(conditions) < 1:
        raise BLE(
            ValueError('RandomForest requires at least one column in '
                       'conditions. Received {}'.format(conditions)))
    self.conditions_categorical = [
        cond[0] for cond in conditions
        if cond[1].lower() == 'categorical']
    self.conditions_numerical = [
        cond[0] for cond in conditions
        if cond[1].lower() != 'categorical']
    self.conditions = (self.conditions_numerical
                       + self.conditions_categorical)

    # --- placeholders for everything learned below ---
    self.dataset = pd.DataFrame()
    self.categories_to_val_map = dict()
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # --- preprocessing ---
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # --- fit ---
    self._train_rf()
def _compute_targets_distribution(self, conditions):
    """Given conditions dict {feature_col:val}, returns the distribution
    and (class mapping for lookup) of the random label
    self.targets|conditions.

    Returns
    -------
    (distribution, classes)
        `distribution` is the first row returned by `predict_proba`;
        `classes` is the `classes_` attribute of the forest that produced
        it.

    Raises
    ------
    BLE wrapping ValueError
        If `conditions` lacks a value for any conditioning column.
    """
    if not set(self.conditions).issubset(set(conditions.keys())):
        raise BLE(
            ValueError('Must specify values for all the conditionals.\n'
                       'Received: {}\n'
                       'Expected: {}'.format(
                           conditions, self.conditions_numerical +
                           self.conditions_categorical)))

    # Are there any category values in conditions which never appeared
    # during training? If yes, we need to run the partial RF.
    unseen = any(
        conditions[cat] not in self.categories_to_val_map[cat]
        for cat in self.conditions_categorical)

    X_numerical = [conditions[col] for col in self.conditions_numerical]

    if unseen:
        # A single sample must be passed as a 2-D (1, n_features) array.
        inputs = np.array([X_numerical])
        distribution = self.rf_partial.predict_proba(inputs)
        classes = self.rf_partial.classes_
    else:
        X_categorical = [
            conditions[col] for col in self.conditions_categorical
        ]
        X_categorical = utils.binarize_categorical_row(
            self.conditions_categorical, self.categories_to_val_map,
            X_categorical)
        inputs = np.hstack((X_numerical, X_categorical)).reshape(1, -1)
        distribution = self.rf_full.predict_proba(inputs)
        # Report the classes of the forest that made the prediction.
        classes = self.rf_full.classes_
    return distribution[0], classes
def pairplot_vars(bdb, varnames, colorby=None, generator_name=None,
                  population_name=None, **kwargs):
    """Use pairplot to show the given variables.

    See help(pairplot) for more plot options.

    Parameters
    ----------
    bdb: __population_to_bdb__
    varnames: list of one or more variables to plot.
    generator_name: __generator_name__
    population_name: __population_name__
    colorby: categorical variable to color all of the plots by.

    Returns
    -------
    figure: a matplotlib.figure.Figure
    """
    if len(varnames) < 1:
        raise BLE(ValueError('Pairplot at least one variable.'))
    if colorby is None:
        qvars = varnames
    else:
        # De-duplicate while keeping the caller's order; a set would
        # shuffle the selected columns and hence the order of the plots.
        qvars = []
        for name in list(varnames) + [colorby]:
            if name not in qvars:
                qvars.append(name)
    query_columns = '''"%s"''' % '''", "'''.join(qvars)
    bql = '''SELECT %s FROM %s''' % (query_columns, population_name)
    df = bqlu.query(bdb, bql)
    return pairplot(bdb, df, generator_name=generator_name,
                    colorby=colorby, **kwargs)
def initialize_session_capture(self, name):
    """Decide the session-capture name (opt-in) or opt-out for this object.

    Precedence: an already-set `self.session_capture_name`, then the
    `name` argument, then the contents of a file named OPTFILE found in
    the current working directory or one of its ancestors (the walk stops
    before the filesystem root).  File contents equal to 'False' mean
    opt-out and are stored as the boolean False.

    Raises
    ------
    BLE
        If none of the above yields a choice.
    """
    if self.session_capture_name is not None:
        return
    if name is not None:
        self.session_capture_name = name
        return
    # Search for a session-capture name or opt-out saved as a file:
    searchdir = os.getcwd()
    while searchdir != os.path.dirname(searchdir):  # While not at root.
        try:
            with open(os.path.join(searchdir, OPTFILE), 'r') as optinfile:
                # Strip so that a trailing newline (e.g. from a text
                # editor) does not turn an opt-out into an opt-in.
                self.session_capture_name = optinfile.read().strip()
            if self.session_capture_name == 'False':
                self.session_capture_name = False
            break
        except IOError:
            # No readable choice file here; try the parent directory.
            pass
        searchdir = os.path.dirname(searchdir)
    # No init option specified, no choice file found. Force the choice.
    if self.session_capture_name is None:
        raise BLE(
            "Please set session_capture_name option to Population.__init__\n"
            "  to either opt-in or opt-out of sending details of your usage of\n"
            "  this software to the MIT Probabilistic Computing Group.\n\n"
            "If you see this in one of our example notebooks,\n"
            "  return to the starting page, the Index.ipynb, to\n"
            "  make that choice.")
def barplot(bdb, df):
    """Make bar-plot from categories and their heights.

    First column specifies names; second column specifies heights.

    Parameters
    ----------
    bdb : __population_to_bdb__
    df : __specifier_to_df__

    Returns
    ----------
    figure: matplotlib.figure.Figure
    """
    if df.shape[1] != 2:
        raise BLE(
            ValueError('Need two columns of output from SELECT for barplot.'))
    n_bars = df.shape[0]
    # Half an inch of figure width per bar.
    width_inches = n_bars / 2.0
    figure, ax = plt.subplots(figsize=(width_inches, 5))

    # `.iloc` selects by position whatever the column labels are; the
    # former `.ix` indexer has been removed from pandas.
    ax.bar([x - .5 for x in range(n_bars)], df.iloc[:, 1].values,
           color='#333333', edgecolor='#333333')
    ax.set_xticks(range(n_bars))
    ax.set_xticklabels(df.iloc[:, 0].values, rotation=90)
    ax.set_xlim([-1, n_bars - .5])
    ax.set_ylabel(df.columns[1])
    ax.set_xlabel(df.columns[0])
    return figure
def draw_crosscat(bdb, generator, modelno, row_label_col=None):
    """Draw crosscat model from the specified generator.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    generator : str
        Name of generator.
    modelno: int
        Number of model to draw.
    row_label_col : str, optional
        Passed through to `draw_state`.

    Returns
    ----------
    figure: matplotlib.figure.Figure

    Raises
    ------
    BLE wrapping ValueError
        If no generator has that name, or its metamodel is not crosscat.
    """
    bql = '''
        SELECT tabname, metamodel
        FROM bayesdb_generator
        WHERE name = ?
    '''
    cursor = bdb.execute(bql, (generator,))
    try:
        table_name, metamodel = cursor.next()
    except StopIteration:
        # Report a missing generator the same way the other lookups in
        # this file do, rather than leaking a bare StopIteration.
        raise BLE(ValueError('No such generator: %s' % (generator,)))

    if metamodel.lower() != 'crosscat':
        raise BLE(ValueError(
            'Metamodel for generator %s (%s) should be crosscat' %
            (generator, metamodel)))

    figure, axes = plt.subplots(tight_layout=False)
    draw_state(bdb, table_name, generator,
               modelno, ax=axes, row_label_col=row_label_col)
    return figure
def do_hist(data_srs, **kwargs):
    """Draw a histogram of the first column of `data_srs` on an axis.

    Parameters
    ----------
    data_srs : pandas.DataFrame
        Column 0 holds the values to plot.  When `colors` is given,
        column 1 holds the value used to pick each row's color.

    Keyword arguments read: `ax` (defaults to `plt.gca()`), `bdb`,
    `generator_name`, `dtype` (looked up or guessed when None),
    `show_contour` (passed as `kde` to seaborn) and `colors` (dict mapping
    a column-1 value to a color).

    Returns
    -------
    The axis that was drawn on.

    Raises
    ------
    BLE wrapping ValueError
        If there is more than one column but no `colors`.
    """
    ax = kwargs.get('ax', None)
    bdb = kwargs.get('bdb', None)
    dtype = kwargs.get('dtype', None)
    show_contour = kwargs.get('show_contour', None)
    generator_name = kwargs.get('generator_name', None)
    colors = kwargs.get('colors', None)

    if dtype is None:
        # Pass the first column itself: get_bayesdb_col_type rejects a
        # whole DataFrame.
        dtype = get_bayesdb_col_type(data_srs.columns[0],
                                     data_srs.iloc[:, 0], bdb=bdb,
                                     generator_name=generator_name)

    if ax is None:
        ax = plt.gca()

    if len(data_srs.shape) > 1:
        if colors is None and data_srs.shape[1] != 1:
            raise BLE(
                ValueError('If a dummy column is specified,'
                           ' colors must also be specified.'))

    data_srs = data_srs.dropna()

    if dtype == 'categorical':
        vals, uvals, _ = conv_categorical_vals_to_numeric(
            data_srs.iloc[:, 0])
        if colors is not None:
            # One stacked series per color value.
            color_lst = []
            stacks = []
            for val, color in colors.items():
                subval = vals[data_srs.iloc[:, 1].values == val]
                color_lst.append(color)
                stacks.append(subval)
            ax.hist(stacks, bins=len(uvals), color=color_lst, alpha=.9,
                    histtype='barstacked', rwidth=1.0)
        else:
            ax.hist(vals, bins=len(uvals))
        ax.set_xticks(range(len(uvals)))
        ax.set_xticklabels(uvals)
    else:
        do_kde = show_contour
        if colors is not None:
            for val, color in colors.items():
                subdf = data_srs.loc[data_srs.iloc[:, 1] == val]
                values = drop_inf_and_nan(subdf.iloc[:, 0])
                if len(values) < 2:
                    # Then seaborn would break. :-p
                    continue
                bins = seaborn_broken_bins(values)
                sns.distplot(values, kde=do_kde, ax=ax, color=color,
                             bins=bins)
        else:
            values = drop_inf_and_nan(data_srs)
            bins = seaborn_broken_bins(values)
            sns.distplot(values, kde=do_kde, ax=ax, bins=bins)

    return ax
def logpdf(self, value, conditions):
    """Return `logpdfGaussian` of `value` around the predicted period.

    The mean is `satellite_period_minutes` evaluated at the pair returned
    by `self._conditions(conditions)`; the third argument is `self.noise`.
    Raises BLE wrapping ValueError when a conditioning column has no value
    in `conditions`.
    """
    required = set(self.conditions)
    supplied = set(conditions.keys())
    if not required.issubset(supplied):
        message = ('Must specify values for all the conditionals.\n'
                   'Received: {}\n'
                   'Expected: {}').format(conditions, self.conditions)
        raise BLE(ValueError(message))
    apogee_km, perigee_km = self._conditions(conditions)
    mean_period = satellite_period_minutes(apogee_km, perigee_km)
    return logpdfGaussian(value, mean_period, self.noise)
def gen_collapsed_legend_from_dict(hl_colors_dict, loc=0, title=None,
                                   fontsize='medium', wrap_threshold=1000):
    """Creates a legend with entries grouped by color.

    For example, if a plot has multiple labels associated with the same
    color line, instead of generating a legend entry for each label, labels
    with the same colored line will be collapsed into longer,
    comma-separated labels.

    Parameters
    ----------
    hl_colors_dict : dict
        A dict of label, color pairs. Colors can be strings e.g. 'deeppink'
        or rgb or rgba tuples.
    loc : matplotlib compatible
        any matplotlib-compatible legend location identifier
    title : str
        legend title
    fontsize : int
        legend entry and title fontsize
    wrap_threshold : int
        max number of characters before wordwrap

    Returns
    -------
    legend : matplotlib.legend

    Raises
    ------
    BLE wrapping TypeError
        If `hl_colors_dict` is not a dict.
    """
    if not isinstance(hl_colors_dict, dict):
        raise BLE(TypeError("hl_colors_dict must be a dict"))

    from collections import defaultdict
    # color -> list of the labels that share it.
    labels_by_color = defaultdict(list)
    for label, color in hl_colors_dict.items():
        labels_by_color[color].append(str(label))

    legend_artists = []
    legend_labels = []
    for color, labels in labels_by_color.items():
        # Sorted, comma-separated, then wrapped to `wrap_threshold`.
        text = "\n".join(wrap(", ".join(sorted(labels)), wrap_threshold))
        legend_artists.append(plt.Line2D((0, 1), (0, 0), color=color, lw=3))
        legend_labels.append(text)

    legend = plt.legend(legend_artists, legend_labels, loc=loc, title=title,
                        fontsize=fontsize)
    return legend
def plot_crosscat_chain_diagnostics(bdb, diagnostic, generator):
    """Plot diagnostics for all models of generator.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    diagnostic : str
        Valid (crosscat) diagnostics are:
            - logscore: log score of the model
            - num_views: the number of views in the model
            - column_crp_alpha: CRP alpha over columns
    generator : str
        Name of the generator to diagnose.

    Returns
    ----------
    figure: matplotlib.figure.Figure

    Raises
    ------
    BLE wrapping ValueError
        If `diagnostic` is not one of the valid diagnostics.
    """
    valid_diagnostics = ['logscore', 'num_views', 'column_crp_alpha']
    if diagnostic not in valid_diagnostics:
        # Both placeholders need an argument: the offending name and the
        # list of valid choices.
        raise BLE(ValueError('Unknown diagnostic %s.\n'
                             'Please choose one of the following instead: %s\n'
                             % (diagnostic, ', '.join(valid_diagnostics))))

    generator_id = bayeslite.core.bayesdb_get_generator(bdb, generator)

    # Get model numbers. Do not rely on there to be a diagnostic for every
    # model.
    bql = '''
        SELECT modelno, COUNT(modelno)
        FROM bayesdb_crosscat_diagnostics
        WHERE generator_id = ?
        GROUP BY modelno
    '''
    df = bu.cursor_to_df(bdb.execute(bql, (generator_id,)))
    models = df['modelno'].astype(int).values

    figure, ax = plt.subplots(tight_layout=True, figsize=(10, 5))
    colors = sns.color_palette("GnBu_d", len(models))
    for i, modelno in enumerate(models):
        # `diagnostic` was checked against the whitelist above, so it is
        # safe to interpolate as a column name.
        bql = '''
            SELECT {}, iterations
            FROM bayesdb_crosscat_diagnostics
            WHERE modelno = ? AND generator_id = ?
            ORDER BY iterations ASC
        '''.format(diagnostic)
        df = bu.cursor_to_df(bdb.execute(bql, (modelno, generator_id)))
        # The palette has one entry per listed model, so index it by
        # position; model numbers need not be contiguous.
        ax.plot(df['iterations'].values, df[diagnostic].values,
                c=colors[i], alpha=.7, lw=2)
        ax.text(df['iterations'].values[-1], df[diagnostic].values[-1],
                str(modelno), color=colors[i])

    ax.set_xlabel('Iteration')
    ax.set_ylabel(diagnostic)
    ax.set_title('%s for each model in %s' % (diagnostic, generator,))
    return figure
def simulate(self, n_samples, conditions):
    """Return a list of `n_samples` simulated periods.

    Each sample is the period from `satellite_period_minutes` plus a draw
    from `self.prng.normal` with scale `self.noise`.  Raises BLE wrapping
    ValueError when a conditioning column has no value in `conditions`.
    """
    required = set(self.conditions)
    supplied = set(conditions.keys())
    if not required.issubset(supplied):
        message = ('Must specify values for all the conditionals.\n'
                   'Received: {}\n'
                   'Expected: {}').format(conditions, self.conditions)
        raise BLE(ValueError(message))
    apogee_km, perigee_km = self._conditions(conditions)
    period_minutes = satellite_period_minutes(apogee_km, perigee_km)
    jitter = self.prng.normal(scale=self.noise, size=n_samples)
    return list(period_minutes + jitter)
def initialize(self):
    """Open the bdb, load the data if needed and ensure a generator exists.

    When `self.bdb` is already set only the representation is checked.
    Otherwise the bdb at `self.bdb_path` is opened; if it has no table
    named `self.name`, the table is created from `self.df`, else from
    `self.csv_path`, else BLE wrapping ValueError is raised.  A crosscat
    generator is created when the bdb has no generators at all.
    """
    if self.bdb:
        self.check_representation()
        return

    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    table_present = bayeslite.core.bayesdb_has_table(self.bdb, self.name)
    if table_present:
        pass
    elif self.df is not None:
        bayeslite.read_pandas.bayesdb_read_pandas_df(
            self.bdb, self.name, self.df, create=True, ifnotexists=True)
    elif self.csv_path:
        bayeslite.bayesdb_read_csv_file(
            self.bdb, self.name, self.csv_path,
            header=True, create=True, ifnotexists=True)
    else:
        # No data source at all: explain what the bdb does contain.
        tables = self.list_tables()
        metamodels = self.list_metamodels()
        if len(tables) + len(metamodels) != 0:
            raise BLE(
                ValueError(
                    "The name of the population must be the same"
                    " as a table in the bdb, one of: " +
                    ", ".join(tables) +
                    "\nNote also that the bdb has the following"
                    " metamodels defined: " + ", ".join(metamodels)))
        raise BLE(
            ValueError(
                "No data sources specified, and an empty bdb."))

    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
        row_count = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
        assert 0 < row_count
        self.query(' CREATE GENERATOR %g IF NOT EXISTS FOR %t'
                   ' USING crosscat( GUESS(*) )')
    self.check_representation()
def get_metadata(bdb, generator_name, modelno):
    """Return the decoded `theta_json` of one model of a generator.

    Raises BLE wrapping ValueError when `bayesdb_crosscat_theta` has no
    row for that generator and model number.
    """
    gen_id = bayeslite.core.bayesdb_get_generator(bdb, generator_name)
    query = (' SELECT theta_json FROM bayesdb_crosscat_theta'
             ' WHERE generator_id = ? and modelno = ? ')
    rows = bdb.sql_execute(query, (gen_id, modelno))
    try:
        first = rows.next()
    except StopIteration:
        raise BLE(ValueError('Could not find generator with '
                             'name {}, or incorrect model number.'.format(
                                 generator_name)))
    return json.loads(first[0])
def get_M_c(bdb, generator_name):
    """Return the decoded `metadata_json` stored for a generator.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
    generator_name : str

    Raises
    ------
    BLE wrapping ValueError
        If `bayesdb_crosscat_metadata` has no row for the generator.
    """
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, generator_name)
    sql = '''
        SELECT metadata_json FROM bayesdb_crosscat_metadata
        WHERE generator_id = ?
    '''
    cursor = bdb.sql_execute(sql, (generator_id,))
    try:
        row = cursor.next()
    except StopIteration:
        # Only the message belongs in the ValueError, not the bdb handle.
        raise BLE(ValueError(
            'No crosscat metadata for generator: %s' % (generator_name,)))
    else:
        return json.loads(row[0])
def quick_explore_vars(self, varnames, plotfile=None, nsimilar=20):
    """Show dependence probabilities and neighborhoods based on those.

    varnames: list of strings
        At least two column names to look at dependence probabilities of,
        and to explore neighborhoods of.
    plotfile: string, optional
        When given, heatmaps are sent to `self.logger.plot` under names
        starting with this prefix.
    nsimilar: positive integer
        The size of the neighborhood to explore.
    """
    if len(varnames) < 2:
        raise BLE(ValueError('Need to explore at least two variables.'))
    self.pairplot_vars(varnames)

    dep_header = ['genid', 'name0', 'name1', 'value']
    query_columns = '''"%s"''' % '''", "'''.join(varnames)

    with self.bdb.savepoint():
        temptab = self.bdb.temp_table_name()
        create_sql = (
            ' CREATE TEMP TABLE %s AS ESTIMATE DEPENDENCE PROBABILITY'
            ' FROM PAIRWISE COLUMNS OF %s FOR %s '
        ) % (temptab, self.generator_name, query_columns)
        self.query(create_sql)
        deps = self.query('SELECT * FROM %s' % (temptab, ))
        deps.columns = list(dep_header)
        triangle_sql = (
            ' SELECT * FROM %s WHERE name0 < name1 ORDER BY value DESC '
        ) % (temptab, )
        triangle = self.query(triangle_sql)
        triangle.columns = list(dep_header)

    if plotfile:
        self.logger.plot(plotfile + '-deps', self.heatmap(deps))
    self.logger.result("Pairwise dependence probability for: %s\n%s\n\n",
                       query_columns, triangle)

    neighborhood_template = (
        'ESTIMATE *, DEPENDENCE PROBABILITY WITH "%s"'
        ' AS "Probability of Dependence with %s"'
        ' FROM COLUMNS OF %s'
        ' ORDER BY "Probability of Dependence with %s"'
        ' DESC LIMIT %d;')
    pairwise_template = ('ESTIMATE DEPENDENCE PROBABILITY'
                         ' FROM PAIRWISE COLUMNS OF %s FOR %s;')
    for col in varnames:
        neighborhood = self.query(
            neighborhood_template
            % (col, col, self.generator_name, col, nsimilar))
        neighbor_names = neighborhood["name"].tolist()
        neighbor_columns = '''"%s"''' % '''", "'''.join(neighbor_names)
        deps = self.query(
            pairwise_template % (self.generator_name, neighbor_columns))
        deps.columns = list(dep_header)
        if plotfile:
            self.logger.plot(plotfile + "-" + col, self.heatmap(deps))
        self.logger.result(
            "Pairwise dependence probability of %s with its " +
            "strongest dependents:\n%s\n\n", col, neighborhood)
def variable_stattypes(bdb, generator_name=None):
    """The modeled statistical types of each variable in order.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
    generator_name : str
        Required, despite the None default.

    Returns
    -------
    DataFrame with columns colno, name and stattype, ordered by colno.

    Raises
    ------
    BLE wrapping NameError
        If there is no such generator.
    """
    # Kept as an assert so callers passing nothing still get the
    # AssertionError they got before.
    assert generator_name
    if not bayeslite.core.bayesdb_has_generator_default(bdb, generator_name):
        raise BLE(NameError('No such generator {}'.format(generator_name)))
    sql = '''
        SELECT c.colno AS colno, c.name AS name,
                gc.stattype AS stattype
            FROM bayesdb_generator AS g,
                (bayesdb_column AS c LEFT OUTER JOIN
                    bayesdb_generator_column AS gc
                    USING (colno))
            WHERE g.id = ? AND g.id = gc.generator_id
                AND g.tabname = c.tabname
            ORDER BY colno ASC;
    '''
    generator_id = bayeslite.core.bayesdb_get_generator_default(
        bdb, generator_name)
    curs = bdb.sql_execute(sql, bindings=(generator_id, ))
    return cursor_to_df(curs)
def describe_generator(bdb, generator_name):
    """Returns a DataFrame containing description of `generator_name`.

    Raises BLE wrapping NameError if there is no such generator.

    Examples
    --------
    >>> bdbcontrib.describe_generator(bdb, 'employees_gen')
    id |          name |   tabname | metamodel
    ---+---------------+-----------+----------
    3  | employees_gen | employees | crosscat
    """
    known = bayeslite.core.bayesdb_has_generator_default(bdb, generator_name)
    if not known:
        raise BLE(NameError('No such generator {}'.format(generator_name)))
    query = (' SELECT id, name, tabname, metamodel'
             ' FROM bayesdb_generator WHERE name = ? ')
    params = (generator_name, )
    return cursor_to_df(bdb.sql_execute(query, bindings=params))
def _compute_targets_distribution(self, conditions):
    """Return (prediction, noise) for the target given `conditions`.

    `conditions` maps each conditioning column to a value.  The partial
    regression (numerical columns only) is used when some categorical
    value is absent from `self.categories_to_val_map`; otherwise the full
    regression is used.  Raises BLE wrapping ValueError when a
    conditioning column has no value.
    """
    supplied = set(conditions.keys())
    if not set(self.conditions).issubset(supplied):
        expected = self.conditions_numerical + self.conditions_categorical
        raise BLE(
            ValueError(('Must specify values for all the conditionals.\n'
                        'Received: {}\n'
                        'Expected: {}').format(conditions, expected)))

    # Does any categorical value fall outside what was seen in training?
    novel_flags = [
        conditions[name] not in self.categories_to_val_map[name]
        for name in self.conditions_categorical
    ]
    numeric_row = [conditions[name] for name in self.conditions_numerical]

    if any(novel_flags):
        inputs = np.array([numeric_row])
        assert inputs.shape == (1, len(self.conditions_numerical))
        model, noise = self.mr_partial, self.mr_partial_noise
    else:
        raw_categories = [
            conditions[name] for name in self.conditions_categorical
        ]
        coded_row = utils.binarize_categorical_row(
            self.conditions_categorical, self.categories_to_val_map,
            raw_categories)
        inputs = np.concatenate(([numeric_row], [coded_row]), axis=1)
        expected_width = len(self.conditions_numerical) + len(coded_row)
        assert inputs.shape == (1, expected_width)
        model, noise = self.mr_full, self.mr_full_noise

    predictions = model.predict(inputs)
    return predictions[0], noise
def describe_generator_models(bdb, generator_name):
    """Returns a DataFrame containing description of the models
    in `generator_name`.

    Raises BLE wrapping NameError if there is no such generator.

    Examples
    --------
    >>> bdbcontrib.describe_generator_models(bdb, 'employees_gen')
    modelno | iterations
    --------+-----------
          0 | 100
    """
    known = bayeslite.core.bayesdb_has_generator_default(bdb, generator_name)
    if not known:
        raise BLE(NameError('No such generator {}'.format(generator_name)))
    query = (' SELECT modelno, iterations FROM bayesdb_generator_model'
             ' WHERE generator_id = ? ')
    gen_id = bayeslite.core.bayesdb_get_generator_default(
        bdb, generator_name)
    params = (gen_id, )
    return cursor_to_df(bdb.sql_execute(query, bindings=params))
def get_bayesdb_col_type(column_name, df_column, bdb=None,
                         generator_name=None):
    """Return the statistical type of a column, looked up or guessed.

    If column_name is a column label (not a short name!) then the modeltype
    of the column will be returned otherwise we guess.

    Parameters
    ----------
    column_name : str
    df_column : pandas.Series
        The column's values; used only for guessing.
    bdb, generator_name : optional
        When both are given the type is looked up with
        `bqlu.get_column_stattype`; an IndexError from that lookup falls
        back to guessing.

    Returns
    -------
    str
        The looked-up type ('cyclic' is reported as 'numerical'), or a
        guess of 'categorical' / 'numerical'.

    Raises
    ------
    BLE wrapping TypeError
        If `df_column` is a DataFrame.
    """
    if isinstance(df_column, pd.DataFrame):
        raise BLE(
            TypeError(
                'Multiple columns in the query result have the same name (%s).'
                % (column_name, )))

    def guess_column_type(df_column):
        pd_type = df_column.dtype
        # Builtin `object` compares equal to the numpy object dtype; the
        # `np.object` alias no longer exists in current NumPy.
        if pd_type is str or pd_type == object:
            return 'categorical'
        # Few distinct values: treat as categorical.
        if len(df_column.unique()) < 30:
            return 'categorical'
        return 'numerical'

    if bdb is not None and generator_name is not None:
        try:
            coltype = bqlu.get_column_stattype(bdb, generator_name,
                                               column_name)
            # XXX: Force cyclic -> numeric because there is no need to plot
            # cyclic data any differently until we implement rose plots. See
            # http://matplotlib.org/examples/pie_and_polar_charts/polar_bar_demo.html
            # for an example.
            if coltype.lower() == 'cyclic':
                coltype = 'numerical'
            return coltype
        except IndexError:
            return guess_column_type(df_column)
    else:
        return guess_column_type(df_column)
def describe_table(bdb, table_name):
    """Returns a DataFrame containing description of `table_name`.

    Raises BLE wrapping NameError if there is no such table.

    Examples
    --------
    >>> bdbcontrib.describe_table(bdb, 'employees')
    tabname   | colno |   name
    ----------+-------+--------
    employees |     0 |   name
    employees |     1 |    age
    employees |     2 | weight
    employees |     3 | height
    """
    known = bayeslite.core.bayesdb_has_table(bdb, table_name)
    if not known:
        raise BLE(NameError('No such table {}'.format(table_name)))
    query = (' SELECT tabname, colno, name FROM bayesdb_column'
             ' WHERE tabname=? ORDER BY tabname ASC, colno ASC ')
    params = (table_name, )
    return cursor_to_df(bdb.sql_execute(query, bindings=params))
def quick_similar_rows(self, identify_row_by, nsimilar=10):
    """Explore rows similar to the identified one.

    identify_row_by : dict
        Dictionary of column names to their values. These will be turned
        into a WHERE clause in BQL, and must identify one unique row.
    nsimilar : positive integer
        The number of similar rows to retrieve.

    Returns the `nsimilar` rows with the highest similarity to the
    identified row, most similar first.

    Raises BLE wrapping NotImplementedError if `identify_row_by` does not
    match exactly one row.
    """
    import hashlib
    # The temp table is reused across calls (IF NOT EXISTS), so its name
    # must depend on everything that determines its contents, including
    # nsimilar.
    table_name = 'tmptbl_' + hashlib.md5('\x00'.join(
        [repr(identify_row_by), str(self.status),
         str(nsimilar)])).hexdigest()
    # Non-word characters in the values are replaced so the result is a
    # plain identifier.
    column_name = 'similarity_to_' + "__".join(
        re.sub(r'\W', '_', str(val)) for val in identify_row_by.values())
    query_params = []
    query_columns = []
    for k, v in identify_row_by.items():
        query_columns.append('''%s = ? ''' % bayeslite.bql_quote_name(k))
        query_params.append(v)
    query_attrs = ' and '.join(query_columns)

    with self.bdb.savepoint():
        row_exists = self.query('SELECT COUNT(*) FROM %s WHERE %s;' %
                                (self.name, query_attrs))
        n_matches = row_exists.iloc[0, 0]
        if n_matches != 1:
            # Name the table that was actually queried.
            raise BLE(
                NotImplementedError(
                    'identify_row_by found %d rows instead of exactly 1 in %s.'
                    % (n_matches, self.name)))
        # Order before limiting; otherwise LIMIT keeps arbitrary rows
        # rather than the most similar ones.
        creation_query = ('''CREATE TEMP TABLE IF NOT EXISTS %s AS ESTIMATE *,
                             SIMILARITY TO (%s) AS %s FROM %%g
                             ORDER BY %s DESC LIMIT %d;''' %
                          (table_name, query_attrs, column_name,
                           column_name, nsimilar))
        self.query(creation_query, query_params)
        result = self.query('''SELECT * FROM %s ORDER BY %s DESC;''' %
                            (table_name, column_name))
    return result
def _pairplot(df, bdb=None, generator_name=None, stattypes=None,
              show_contour=False, colorby=None, show_missing=False,
              show_full=False, **kwargs):
    """Plots the columns in data_df in a facet grid.

    Supports the following pairs:
    - categorical-categorical pairs are displayed as a heatmap
    - continuous-continuous pairs are displayed as a kdeplot
    - categorical-continuous pairs are displayed on a violin plot

    Parameters
    ----------
    df : pandas.DataFrame
        The input data---the result of a BQL/SQL query
    bdb : bayeslite.BayesDB (optional)
        The BayesDB object associated with `df`. Having the BayesDB object
        and the generator for the data allows pairplot to choose plot types.
    generator_name : str
        The name of generator associated with `df` and `bdb`.
    stattypes : dict, optional
        {column_name: "categorical"|"numerical"}
        If you do not specify a generator name, have a column that the
        generator doesn't know about, or would like to override the
        statistical types the generator has for a given variable, then pass
        this dict of column names to types.
    show_contour : bool
        If True, KDE contours are plotted on top of scatter plots
        and histograms.
    show_missing : bool
        If True, rows with one missing value are plotted as lines on scatter
        plots.
    colorby : str
        Name of a column to use to color data points in histograms and
        scatter plots.
    show_full : bool
        Show full pairwise plots, rather than only lower triangular plots.
    kwargs : dict
        Options to pass through to underlying plotting function (for pairs).

    Returns
    -------
    figure : matplotlib.figure.Figure
        A num_columns by num_columns Gridspec of pairplot axes.

    Raises
    ------
    BLE wrapping ValueError
        If `colorby` matches no column, matches several columns, or is
        not categorical.

    Notes
    -----
    Support soon for ordered continuous combinations. It may be best
    to plot all ordered continuous pairs as heatmap.
    """
    # NOTE:Things to consider:
    # - String values are a possibility (categorical)
    # - who knows what the columns are named. What if the user selects
    #   columns as shortname?
    # - where to handle dropping NaNs? Missing values may be informative.

    data_df = df
    if stattypes is None:
        stattypes = {}

    def lookup_type(name, column):
        # Explicit override first, then the generator; with neither the
        # column is taken to be categorical.
        if name in stattypes:
            return stattypes[name]
        if generator_name:
            return get_bayesdb_col_type(name, column, bdb=bdb,
                                        generator_name=generator_name)
        return "categorical"

    colors = None
    if colorby is not None:
        n_colorby = 0
        for colname in data_df.columns:
            # XXX: This is not guaranteed to work on all Unicode characters.
            if colorby.lower() == colname.lower():
                n_colorby += 1
                # Adopt the column's actual spelling.
                colorby = colname
        if n_colorby == 0:
            raise BLE(
                ValueError('colorby column, {}, not found.'.format(colorby)))
        elif n_colorby > 1:
            raise BLE(ValueError(
                'Multiple columns named, {}.'.format(colorby)))

        dummy = data_df[colorby].dropna()
        dvals = np.sort(dummy.unique())
        ndvals = len(dvals)
        dval_type = lookup_type(colorby, dummy)
        if dval_type.lower() != 'categorical':
            raise BLE(ValueError('colorby columns must be categorical.'))
        cmap = sns.color_palette("Set1", ndvals)
        colors = {}
        for val, color in zip(dvals, cmap):
            colors[val] = color

    all_varnames = [c for c in data_df.columns if c != colorby]
    n_vars = len(all_varnames)
    plt_grid = gridspec.GridSpec(n_vars, n_vars)
    figure = plt.figure()

    # if there is only one variable, just do a hist
    if n_vars == 1:
        ax = plt.gca()
        # Take the plotted variable from all_varnames: the first column
        # of data_df may be the colorby column.
        varname = all_varnames[0]
        vartype = lookup_type(varname, data_df[varname])
        # do_hist expects the data in column 0 and colorby in column 1.
        hist_columns = [varname]
        if colorby is not None:
            hist_columns.append(colorby)
        do_hist(data_df[hist_columns], dtype=vartype, ax=ax, bdb=bdb,
                colors=colors)
        rotate_tick_labels(ax)
        return figure

    xmins = np.ones((n_vars, n_vars)) * float('Inf')
    xmaxs = np.ones((n_vars, n_vars)) * float('-Inf')
    ymins = np.ones((n_vars, n_vars)) * float('Inf')
    ymaxs = np.ones((n_vars, n_vars)) * float('-Inf')

    vartypes = [lookup_type(varname, data_df[varname])
                for varname in all_varnames]

    # store each axes; reaccessing ax with plt.subplot(plt_grid[a,b]) may
    # overwrite the ax
    axes = [[] for _ in range(len(all_varnames))]
    for x_pos, var_name_x in enumerate(all_varnames):
        var_x_type = vartypes[x_pos]
        for y_pos, var_name_y in enumerate(all_varnames):
            var_y_type = vartypes[y_pos]

            ax = figure.add_subplot(plt_grid[y_pos, x_pos])
            axes[y_pos].append(ax)

            if x_pos == y_pos:
                varnames = [var_name_x]
                if colorby is not None:
                    varnames.append(colorby)
                ax = do_hist(data_df[varnames], dtype=var_x_type, ax=ax,
                             bdb=bdb, colors=colors,
                             show_contour=show_contour)
            else:
                varnames = [var_name_x, var_name_y]
                vartypes_pair = (var_x_type, var_y_type,)
                if colorby is not None:
                    varnames.append(colorby)
                plot_df = prep_plot_df(data_df, varnames)
                ax = do_pair_plot(plot_df, vartypes_pair, ax=ax, bdb=bdb,
                                  show_contour=show_contour,
                                  show_missing=show_missing,
                                  colors=colors, **kwargs)

                # Record limits for pair plots only: a histogram's y axis
                # is a count/density and must not widen the row's shared
                # y range.
                ymins[y_pos, x_pos] = ax.get_ylim()[0]
                ymaxs[y_pos, x_pos] = ax.get_ylim()[1]
                xmins[y_pos, x_pos] = ax.get_xlim()[0]
                xmaxs[y_pos, x_pos] = ax.get_xlim()[1]

            ax.set_xlabel(var_name_x, fontweight='bold')
            ax.set_ylabel(var_name_y, fontweight='bold')

    for x_pos in range(n_vars):
        for y_pos in range(n_vars):
            ax = axes[y_pos][x_pos]
            # Self-histogram for x only, or comparative x against y:
            ax.set_xlim([np.min(xmins[:, x_pos]), np.max(xmaxs[:, x_pos])])
            if x_pos != y_pos:
                ax.set_ylim([np.min(ymins[y_pos, :]),
                             np.max(ymaxs[y_pos, :])])

            # Y labels
            if x_pos > 0:
                if x_pos == n_vars - 1:  # All the way to the right:
                    ax.yaxis.tick_right()
                    ax.yaxis.set_label_position('right')
                else:  # No labels inside:
                    ax.set_ylabel('')
                    ax.set_yticklabels([])
            else:
                ax.yaxis.tick_left()
                ax.yaxis.set_label_position('left')

            # X labels:
            if y_pos < n_vars - 1:
                if y_pos == 0:  # At top, show x labels on top:
                    ax.xaxis.tick_top()
                    ax.xaxis.set_label_position('top')
                else:  # No labels inside:
                    ax.set_xlabel('')
                    ax.set_xticklabels([])
            else:
                ax.xaxis.tick_bottom()
                ax.xaxis.set_label_position('bottom')

            rotate_tick_labels(ax)

    def fake_axis_ticks(ax_tl, ax_tn):
        # Give ax_tl the tick labels of ax_tn, placed at the same relative
        # heights within ax_tl's own y range.
        atl, btl = ax_tl.get_ylim()
        atn, btn = ax_tn.get_ylim()
        tnticks = ax_tn.get_yticks()
        yrange_tn = (btn - atn)
        yrange_tl = (btl - atl)
        tntick_ratios = [(t - atn) / yrange_tn for t in tnticks]
        ax_tl.set_yticks([r * yrange_tl + atl for r in tntick_ratios])
        ax_tl.set_yticklabels(tnticks)

    # Fix the top-left histogram y-axis ticks and labels.
    if show_full:
        fake_axis_ticks(axes[0][0], axes[0][1])
    fake_axis_ticks(axes[-1][-1], axes[-1][0])

    if colorby is not None:
        legend = gen_collapsed_legend_from_dict(colors, title=colorby)
        legend.draggable()

    # tril by default by deleting upper diagonal axes.
    if not show_full:
        for y_pos in range(n_vars):
            for x_pos in range(y_pos + 1, n_vars):
                figure.delaxes(axes[y_pos][x_pos])

    return figure
def draw_state(bdb, table_name, generator_name, modelno,
               ax=None, border_width=3, row_label_col=None, short_names=True,
               hilight_rows=None, hilight_rows_colors=None,
               hilight_cols=None, hilight_cols_colors=None,
               separator_color='black', separator_width=4,
               blank_state=False, nan_color=(1., 0., 0., 1.),
               view_labels=None, view_label_fontsize='large',
               legend=True, legend_fontsize='medium',
               row_legend_loc=1, row_legend_title='Row key',
               col_legend_loc=4, col_legend_title='Column key',
               descriptions_in_legend=True, legend_wrap_threshold=20):
    """Creates a debugging (read: not pretty) rendering of a CrossCat state.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        The BayesDB object associated with the CrossCat state
    table_name : str
        The btable name containing the data associated with the state
    generator_name : str
        The CrossCat generator associated with the state
    modelno : int
        The index of the model/state to draw
    ax : matplotlib.axis
        The axis on which to draw. Defaults to the current axis
        (`plt.gca()`).
    row_label_col : list, optional
        One label per table row. If None (default), rows are labeled with
        their index. Passing a column name (str) is not implemented yet and
        raises NotImplementedError.
    short_names : bool
        Use shortnames as column labels
    hilight_rows : list<str>, optional
        A list of row labels to hilight with colored rectangles. None
        (default) means no rows are hilighted.
    hilight_rows_colors : list or dict, optional
        Contains a color (str or tuple) for each entry in `hilight_rows`. If
        not specified, unique colors for each entry are generated.
    hilight_cols : list<str>, optional
        A list of column names to hilight with colored rectangles. None
        (default) means no columns are hilighted.
    hilight_cols_colors : list or dict, optional
        Contains a color (str or tuple) for each entry in `hilight_cols`. If
        not specified, unique colors for each entry are generated.
    blank_state : bool
        If True, draws an unsorted, unpartitioned state
    view_labels : list<str>
        Labels placed above each view. If `len(view_labels) < num_views` then
        only the views for which there are entries are labeled. The list
        passed in is not modified.
    view_label_fontsize : valid matplotlib `fontsize`
        Font size used for view labels
    legend : bool
        If True (default) displays legend
    legend_fontsize : valid matplotlib `fontsize`
        Font size used for legend entries and titles
    row_legend_loc : matplotlib.legend location
        location of the row legend. For use with row hilighting
    col_legend_loc : matplotlib.legend location
        location of the column legend. For use with column hilighting

    Returns
    -------
    ax : matplotlib.axis
        The state rendering

    Raises
    ------
    BLE(TypeError)
        If `view_labels` is not a list, if a list `row_label_col` does not
        have one entry per row, or if `row_label_col` has an unhandled type.
    NotImplementedError
        If `row_label_col` is a str.
    KeyError
        If an entry of `hilight_rows` is not a row label, or an entry of
        `hilight_cols` is not a column name of the generator.

    Other Parameters
    ----------------
    border_width : int
        The number of cells between views. Use larger values for longer row
        names.
    separator_color : str or (r, g, b) or (r, g, b, alpha) tuple
        The color of the cluster separator. Default is black.
    separator_width : int
        linewidth of the cluster separator
    nan_color : str or (r, g, b) or (r, g, b, alpha) tuple
        The color for missing/NaN values. Default is red.
    row_legend_title : str
        title of the row legend
    col_legend_title : str
        title of the column legend
    legend_wrap_threshold : int
        Max number of characters until wordwrap for collapsed legends. For
        use when multiple entries in `hilight_rows_colors` or
        `hilight_cols_colors` contain the same color.
    descriptions_in_legend : bool
        If True (default), the column descriptions (requires codebook) are
        added to the legend
    """
    # None stands in for "nothing hilighted"; avoids mutable default args.
    hilight_rows = [] if hilight_rows is None else hilight_rows
    hilight_cols = [] if hilight_cols is None else hilight_cols

    theta = get_metadata(bdb, generator_name, modelno)
    M_c = get_M_c(bdb, generator_name)

    # idx_to_name doesn't use an int idx, but a string idx because
    # crosscat. Yep.
    ordered_columns = [M_c['idx_to_name'][str(idx)]
                       for idx in sorted(M_c['name_to_idx'].values())]
    T = bu.get_data_as_list(bdb, table_name, column_list=ordered_columns)
    X_L = theta['X_L']
    X_D = theta['X_D']

    num_rows = len(T)
    num_cols = len(T[0])

    if not blank_state:
        sortedstate = DrawStateUtils.sort_state(X_L, X_D, M_c, T)
        sorted_views, sorted_clusters, sorted_cols, sorted_rows = sortedstate
        column_partition = X_L['column_partition']['assignments']
    else:
        blankstate = DrawStateUtils.gen_blank_sort(num_rows, num_cols)
        sorted_views, sorted_clusters, sorted_cols, sorted_rows = blankstate
        column_partition = [0]*num_cols

    if view_labels is not None:
        if not isinstance(view_labels, list):
            raise BLE(TypeError("view_labels must be a list"))
        # Pad a *copy* with blanks up to one label per view, so the caller's
        # list is left untouched ([''] * negative is simply []).
        n_missing = len(sorted_views) - len(view_labels)
        view_labels = view_labels + ['']*n_missing
    else:
        # One default label per view (not per row).
        view_labels = ['V ' + str(i) for i in range(len(sorted_views))]

    # Both colormap entries are opaque white; the actual cell colors come
    # from gen_cell_colors below (should probably give the user control
    # over this).
    cmap = matplotlib.colors.ListedColormap([(1, 1, 1, 1), (1, 1, 1, 1)])

    T = DrawStateUtils.convert_t_do_numerical(T, M_c)
    num_views = len(sorted_cols)
    # Width of the rendering in cells: data columns plus one border per view.
    total_width = num_cols + num_views*border_width

    # row hilighting
    # A colors argument of None is passed through unchanged so that
    # gen_hilight_colors can generate a unique color per label, as documented
    # (converting None to [] made it reject any non-empty hilight list).
    row_hl_colors = DrawStateUtils.gen_hilight_colors(hilight_rows,
                                                      hilight_rows_colors)

    hl_row_idx_label_zip = []
    if row_label_col is None:
        row_labels = [str(i) for i in range(num_rows)]
    elif isinstance(row_label_col, list):
        if len(row_label_col) != num_rows:
            raise BLE(TypeError("If row_label_col is a list, it must have an "
                                "entry for each row"))
        row_labels = [str(label) for label in row_label_col]
    elif isinstance(row_label_col, str):
        # FIXME: This is not going to work until BayesDB stops removing key
        # and ignore columns from the data. Once it does, the labels should
        # be read from the named column of T.
        raise NotImplementedError
    else:
        raise BLE(TypeError("Unhandled row_label_col type {}.".format(
            type(row_label_col))))

    row_idx_to_label = {}
    row_label_to_idx = {}
    for row, label in enumerate(row_labels):
        # XXX: Allows missing entries to be a column label
        row_idx_to_label[row] = label
        row_label_to_idx[label] = row

    for label in hilight_rows:
        hl_row_idx_label_zip.append((row_label_to_idx[label], label,))

    # column hilighting
    col_hl_colors = DrawStateUtils.gen_hilight_colors(hilight_cols,
                                                      hilight_cols_colors)

    # Not used further down; the lookup is kept because it rejects unknown
    # column names with a KeyError before anything is drawn.
    hl_col_idx_label_zip = []
    for label in hilight_cols:
        hl_col_idx_label_zip.append((M_c['name_to_idx'][label], label,))

    # generate a heatmap using the data (allows clusters to have different
    # base colors)
    cell_colors = DrawStateUtils.gen_cell_colors(T, sorted_views, sorted_cols,
                                                 sorted_clusters, sorted_rows,
                                                 column_partition, cmap,
                                                 border_width,
                                                 nan_color=nan_color)

    if ax is None:
        ax = plt.gca()

    ax.imshow(cell_colors, cmap=cmap, interpolation='nearest', origin='upper',
              aspect='auto')
    col_count = 0
    for v, view in enumerate(sorted_views):
        view_x_labels = [M_c['idx_to_name'][str(col)]
                         for col in sorted_cols[view]]
        if short_names:
            view_x_tick_labels = bu.get_shortnames(bdb, table_name,
                                                   view_x_labels)
        else:
            view_x_tick_labels = view_x_labels

        y_tick_labels = []

        # Horizontal extent of this view, in cell coordinates.
        num_cols_view = len(sorted_cols[view])
        sbplt_start = col_count+v*border_width
        sbplt_end = col_count+num_cols_view+v*border_width

        # Column labels below the view; hilighted columns also get a frame.
        for i, vxtl in enumerate(view_x_labels):
            if vxtl in hilight_cols:
                edgecolor = col_hl_colors[vxtl]
                x_a = sbplt_start+i-.5
                ax.add_patch(Rectangle((x_a, -.5), 1, num_rows,
                                       facecolor="none", edgecolor=edgecolor,
                                       lw=2, zorder=10))
                fontcolor = edgecolor
                fontsize = 'x-small'
            else:
                fontcolor = '#333333'
                fontsize = 'x-small'
            font_kws = dict(color=fontcolor, fontsize=fontsize, rotation=90,
                            va='top', ha='center')
            ax.text(sbplt_start+i+.5, num_rows+.5, view_x_tick_labels[i],
                    font_kws)

        # View label, centered above the view.
        view_label_x = (sbplt_start+sbplt_end)/2. - .5
        view_label_y = -2.5
        font_kws = dict(ha='center', fontsize=view_label_fontsize,
                        weight='bold')
        ax.text(view_label_x, view_label_y, view_labels[v], font_kws)

        # Cluster separators and row hilights; y is the first row of the
        # current cluster within this view.
        y = 0
        for cluster in sorted_clusters[view]:
            y_tick_labels += [row_idx_to_label[row]
                              for row in sorted_rows[view][cluster]]
            ax.plot([sbplt_start-.5, sbplt_end-.5], [y-.5, y-.5],
                    color=separator_color, lw=separator_width)
            for row, label in hl_row_idx_label_zip:
                try:
                    pos = sorted_rows[view][cluster].index(row)
                except ValueError:
                    # The hilighted row is not in this cluster.
                    pos = None

                if pos is not None:
                    edgecolor = row_hl_colors[label]
                    ax.add_patch(Rectangle((sbplt_start - .5, y + pos - .5),
                                           num_cols_view, 1, facecolor="none",
                                           edgecolor=edgecolor, lw=2,
                                           zorder=10))

            y += len(sorted_rows[view][cluster])

        # Row labels to the left of the view.
        for i in range(num_rows):
            if y_tick_labels[i] in hilight_rows:
                fontcolor = row_hl_colors[y_tick_labels[i]]
                fontsize = 'x-small'
                fontweight = 'bold'
                zorder = 10
            else:
                fontsize = 'x-small'
                fontcolor = '#333333'
                fontweight = 'light'
                zorder = 5

            ax.text(sbplt_start-1, i+.5, str(y_tick_labels[i]), ha='right',
                    fontsize=fontsize, color=fontcolor, weight=fontweight,
                    zorder=zorder)
        col_count += num_cols_view

    # generate row legend
    # Use matplotlib artists to generate a list of colored lines
    # TODO: Refactor legend generator into its own function
    if legend:
        if len(hilight_rows) > 0:
            row_legend = pu.gen_collapsed_legend_from_dict(
                row_hl_colors, loc=row_legend_loc, title=row_legend_title,
                fontsize=legend_fontsize,
                wrap_threshold=legend_wrap_threshold)
            ax.add_artist(row_legend)

        if len(hilight_cols) > 0:
            col_legend_labels = bu.get_shortnames(bdb, table_name,
                                                  hilight_cols)
            if descriptions_in_legend:
                for i, col_id in enumerate(hilight_cols):
                    col_legend_labels[i] += ': ' + bu.get_descriptions(
                        bdb, table_name, [col_id])[0]

            # Take the colors from the resolved lookup, so the legend matches
            # what was drawn whether the colors were given as a list, given
            # as a dict, or generated.
            col_legend_colors = [col_hl_colors[col_id]
                                 for col_id in hilight_cols]
            col_legend = pu.gen_collapsed_legend_from_dict(
                dict(zip(col_legend_labels, col_legend_colors)),
                loc=col_legend_loc, title=col_legend_title,
                fontsize=legend_fontsize,
                wrap_threshold=legend_wrap_threshold)
            ax.add_artist(col_legend)

    # Strip ticks, spines and grid so only the cells and the text remain.
    ax.tick_params(**{
        'axis': 'both',
        'length': 0
    })
    ax.set_xlim([-.5, total_width])
    ax.set_ylim([num_rows, -.5])
    ax.spines['bottom'].set_color('white')
    ax.spines['top'].set_color('white')
    ax.spines['right'].set_color('white')
    ax.spines['left'].set_color('white')
    ax.set_yticks(range(num_rows))
    ax.set_xticks(range(total_width))
    ax.tick_params(axis='x', colors='white')
    ax.set_yticklabels(['']*num_rows)
    ax.tick_params(axis='y', colors='white')
    ax.grid(b=False)
    ax.set_axis_bgcolor('white')
    return ax
def comparative_hist(df, bdb=None, nbins=15, normed=False):
    """Plot a histogram.

    Given a one-column pandas.DataFrame, df, plots a simple histogram. Given a
    two-column df plots the data in column one colored by an optional column
    2. If given, column 2 must be categorical.

    Rows containing missing values are dropped before plotting.

    Parameters
    ----------
    df : pandas.DataFrame
        One or two columns: the data to histogram and, optionally, the
        column to color by.
    bdb : bayeslite.BayesDB, optional
        Passed to `get_bayesdb_col_type` when determining the type of the
        first column.
    nbins : int
        Number of bins (bars). Only used when the first column is not
        categorical; a categorical column gets one bar per category.
    normed : bool
        If True, normalizes the area of the histogram (or each
        sub-histogram if df has two columns) to 1.

    Returns
    -------
    figure: matplotlib.figure.Figure

    Raises
    ------
    BLE(ValueError)
        If a non-categorical first column has no rows left after dropping
        missing values.
    BLE(NotImplementedError)
        If df has more than two columns.
    """
    df = df.dropna()
    first_col = df.columns[0]

    vartype = get_bayesdb_col_type(first_col, df[first_col], bdb=bdb)
    ticklabels = None
    if vartype == 'categorical':
        values, labels, lookup = conv_categorical_vals_to_numeric(
            df[first_col])
        # Plot the numeric codes; df itself is left unmodified.
        data = np.asarray(values)
        # ticklabels[code] is the category carrying that code.
        ticklabels = [0] * len(labels)
        for key, val in lookup.items():
            ticklabels[val] = key
        # One unit-wide bin centered on each code, shared by every colorby
        # subset so that the bars line up.
        # NOTE(review): assumes the codes are the integers 0..len(labels)-1,
        # as the indexing of ticklabels above implies -- confirm against
        # conv_categorical_vals_to_numeric.
        bins = np.arange(len(labels) + 1) - 0.5
    else:
        data = df.iloc[:, 0].values
        if len(data) == 0:
            raise BLE(ValueError(
                'Tried to plot a histogram of an empty result.'))
        lo = float(min(data))
        hi = float(max(data))
        if lo == hi:
            # All values identical: widen the range so bins have nonzero
            # width.
            lo, hi = lo - 0.5, hi + 0.5
        # nbins bins need nbins + 1 edges; the last bin includes hi.
        bins = np.linspace(lo, hi, nbins + 1)

    colorby = None
    if len(df.columns) > 1:
        if len(df.columns) > 2:
            raise BLE(
                NotImplementedError(
                    'comparative_hist not defined on more than two variables.')
            )
        colorby = df.columns[1]
        colorby_vals = df[colorby].unique()

    figure, ax = plt.subplots(tight_layout=False, facecolor='white')
    if colorby is None:
        ax.hist(data, bins=bins, color='#383838', edgecolor='none',
                normed=normed)
        plot_title = first_col
    else:
        colors = sns.color_palette('deep', len(colorby_vals))
        for color, cbv in zip(colors, colorby_vals):
            # Rows belonging to this colorby value, positionally aligned
            # with data.
            mask = (df[colorby] == cbv).values
            ax.hist(data[mask], bins=bins, color=color, alpha=.5,
                    edgecolor='none', normed=normed,
                    label=("%s (n=%d)" % (str(cbv), int(mask.sum()))))
        ax.legend(loc=0, title=colorby)
        plot_title = first_col + " by " + colorby

    if ticklabels is not None:
        # Show category names rather than their numeric codes.
        ax.set_xticks(range(len(ticklabels)))
        ax.set_xticklabels(ticklabels)

    if normed:
        plot_title += " (normalized)"

    ax.set_title(plot_title)
    ax.set_xlabel(first_col)

    return figure
def estimate_kl_divergence(bdb, generatorA, generatorB, targets=None,
                           givens=None, n_samples=None):
    """Estimate the KL divergence.

    The KL divergence is a measure of the "information lost" when generatorB
    (the approximating generator) is used to approximate generatorA (the base
    generator). KL divergence is not symmetric, and KL(genA||genB) is not
    necessarily equal to KL(genB||genA).

    TODO: Monte Carlo estimation is a terrible way to compute the KL
    divergence. (Not to say there are better methods in general). One
    illustration of this is that the estimated KL divergence has empirically
    been shown to obtain negative realizations for high-dimensional data.

    Computing the KL divergence in general (of high dimensional
    distributions) is a very hard problem; most research uses the structure
    of the distributions to find good estimators. Adaptive quadrature or
    exact methods for numerical integration could outperform Monte Carlo?

    TODO: More sophisticated algorithm for detecting cases where absolute
    continuity could be a problem (currently have a heuristic). As it stands,
    Monte Carlo estimates may have infinite variance depending on simulated
    values from generatorA.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    generatorA : str
        Name of base generator.
    generatorB : str
        Name of approximating generator.
    targets : list<str>, optional
        List of columns in the table for which to compute the
        log-likelihood. Defaults to all the columns.
    givens : list<tuple>, optional
        A list of [(column, value)] pairs on which to condition on. Defaults
        to no conditionals. See example for more details.
    n_samples: int, optional
        Number of simulated samples to use in the Monte Carlo estimate.
        Defaults to 10,000. Must be at least 1.

    Returns
    -------
    kl : float
        The KL divergence. May be infinity.

    Raises
    ------
    BLE(ValueError)
        If n_samples is less than 1, if a simulated value has zero
        probability under generatorA, or if the estimate comes out negative.

    Example:
    estimate_kl_divergence(bdb, 'crosscat_gen', 'baxcat_gen',
        targets=['weight', 'height'],
        givens=[('nationality', 'USA'), ('age', 17)])
    """
    # XXX Default to 10,000 samples
    if n_samples is None:
        n_samples = 10000
    if n_samples < 1:
        # Otherwise the final average would divide by zero.
        raise BLE(
            ValueError(
                'n_samples must be at least 1. Received {}'.format(
                    n_samples)))

    # Defaults to all columns if targets is None.
    targets = extract_target_cols(bdb, generatorA, targets=targets)

    # Defaults to no givens if givens is None.
    # Materialize the pairs because they are traversed twice below.
    given_pairs = list(extract_given_cols_vals(givens=givens))

    # Obtain samples from the base distribution.
    if given_pairs:
        # Pass the given *values* as bindings rather than formatting them
        # into the query text, so strings such as 'USA' need no quoting.
        # NOTE(review): assumes extract_given_cols_vals returns raw
        # (column, value) pairs -- confirm it does not pre-quote values.
        given_clause = ','.join('{}=?'.format(c) for (c, _v) in given_pairs)
        given_vals = tuple(v for (_c, v) in given_pairs)
        bql = '''
            SIMULATE {} FROM {} GIVEN {} LIMIT {}
            '''.format(','.join(targets), bql_quote_name(generatorA),
                       given_clause, n_samples)
        samples = bdb.execute(bql, given_vals)
    else:
        bql = '''
            SIMULATE {} FROM {} LIMIT {}
            '''.format(','.join(targets), bql_quote_name(generatorA),
                       n_samples)
        samples = bdb.execute(bql)

    # The per-column probability queries do not depend on the sample, so
    # build each pair (under generatorA, under generatorB) once.
    prob_bql = '''
        ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
        '''
    prob_queries = [
        (prob_bql.format(col, bql_quote_name(generatorA)),
         prob_bql.format(col, bql_quote_name(generatorB)))
        for col in targets]

    kl = 0
    for s in samples:
        logp_a, logp_b = 0, 0
        # XXX Assume joint probability factors by summing univariate
        # (conditional) probability of each cell value. This is clearly
        # wrong, until we can evaluate joint densities in BQL.
        # NOTE(review): the probability queries carry no GIVEN clause even
        # when the samples were simulated under givens -- confirm intended.
        for col, val, (bql_a, bql_b) in zip(targets, s, prob_queries):
            p_a = bdb.execute(bql_a, (val, )).fetchvalue()
            p_b = bdb.execute(bql_b, (val, )).fetchvalue()

            # XXX Heuristic to detect when genA is not absolutely
            # continuous wrt genB
            if p_a == 0:
                # How on earth did we simulate a value from genA with zero
                # density/prob under genA?
                raise BLE(
                    ValueError(
                        'Fatal error: simulated a (col,val)=({},{}) '
                        'from base generatorA ({}) with zero density. Check '
                        'implementation of simulate and/or logpdf of '
                        'generator.'.format(col, val, generatorA)))
            if p_b == 0:
                # Detected failure of absolute continuity
                # (under assumption that joint factors into marginals)
                return float('inf')

            logp_a += math.log(p_a)
            logp_b += math.log(p_b)

        kl += (logp_a - logp_b)

    # XXX A negative estimate is possible, see TODO in docstring.
    if kl < 0:
        raise BLE(
            ValueError(
                'Cannot compute reasonable value for KL divergence. '
                'Try increasing the number of samples (currently using {} '
                'samples).'.format(n_samples)))

    # float() keeps the result a float even if no samples were returned.
    return float(kl) / n_samples
def histogram(bdb, df, nbins=15, bins=None, normed=None):
    """Plot histogram of one- or two-column table.

    If two-column, subdivide the first column according to labels in
    the second column

    Rows containing missing values are dropped before plotting.

    Parameters
    ----------
    bdb : __population_to_bdb__
    df : __specifier_to_df__
    nbins : int, optional
        Number of bins in the histogram. Ignored when `bins` is given.
    bins : optional
        Passed to matplotlib's `hist` as its `bins` argument. If None
        (default), `nbins` equal-width bins spanning the range of the
        first column are used.
    normed : bool, optional
        If True, normalizes the area of the histogram (or each
        sub-histogram if df has two columns) to 1.

    Returns
    ----------
    figure: matplotlib.figure.Figure

    Raises
    ------
    BLE(ValueError)
        If df has no columns, has more than two columns, or (when `bins` is
        not given) has no rows left after dropping missing values.
    BLE(TypeError)
        If the first column is categorical, or the second column is not.
    """
    df = df.dropna()
    if len(df.columns) == 0:
        raise BLE(ValueError('Tried to plot a histogram of an empty result.'))
    first_col = df.columns[0]

    vartype = get_bayesdb_col_type(first_col, df[first_col], bdb=bdb)
    if vartype == 'categorical':
        raise BLE(
            TypeError(
                "Cannot histogram categorical varible %s. Barplot? Colorby?" %
                (first_col, )))

    data = df.iloc[:, 0].values

    if nbins is None:
        nbins = len(bins) if bins is not None else 15
    if bins is None:
        if len(data) == 0:
            # min()/max() below would otherwise fail with a bare ValueError.
            raise BLE(
                ValueError('Tried to plot a histogram of an empty result.'))
        lo = float(min(data))
        hi = float(max(data))
        if lo == hi:
            # All values identical: widen the range so bins have nonzero
            # width.
            lo, hi = lo - 0.5, hi + 0.5
        # nbins bins need nbins + 1 edges; the last bin includes hi.
        bins = np.linspace(lo, hi, nbins + 1)

    colorby = None
    if len(df.columns) > 1:
        if len(df.columns) > 2:
            raise BLE(ValueError('Got more columns than data and colorby.'))
        colorby = df.columns[1]
        colorby_stattype = get_bayesdb_col_type(colorby, df[colorby],
                                                bdb=bdb)
        if colorby_stattype != 'categorical':
            raise BLE(
                TypeError("Cannot color by non-categorical variable " +
                          colorby))
        colorby_vals = df[colorby].unique()

    figure, ax = plt.subplots(tight_layout=False, facecolor='white')
    if colorby is None:
        ax.hist(data, bins=bins, color='#383838', edgecolor='none',
                normed=normed)
        plot_title = first_col
    else:
        colors = sns.color_palette('deep', len(colorby_vals))
        for color, cbv in zip(colors, colorby_vals):
            # The same bin edges are used for every subset so bars line up.
            subdf = df[df[colorby] == cbv]
            ax.hist(subdf.iloc[:, 0].values, bins=bins, color=color,
                    alpha=.5, edgecolor='none', normed=normed,
                    label=str(cbv))
        ax.legend(loc=0, title=colorby)
        plot_title = first_col + " by " + colorby

    if normed:
        plot_title += " (normalized)"

    ax.set_title(plot_title)
    ax.set_xlabel(first_col)
    return figure