def run(args):
    """Demo of inference confidence.

    `args` is a dict with the following keys:

    n_samples : int
        the number of samples for imputation
    n_steps : int
        the number of crosscat iterations to use for confidence
    n_modes : int
        the number of modes from which to draw the sample data
    mean_std : float (greater than 0)
        the standard deviation between the means of the sample modes
    std_std : float (greater than 0)
        the standard deviation between the standard deviations of the modes
    seed : int
        RNG seed. If seed < 0, system time is used.
    """
    n_samples = args['n_samples']
    n_steps = args['n_steps']
    n_modes = args['n_modes']
    mean_std = args['mean_std']
    std_std = args['std_std']

    seed = args['seed']
    if seed < 0:
        # Negative seed means "seed from the wall clock".
        import time
        seed = int(time.time())
    np.random.seed(seed)

    # Mode means are i.i.d. N(0, mean_std); mode standard deviations are
    # drawn from an inverse-gamma distribution controlled by std_std.
    means = np.random.normal(0, mean_std, n_modes)
    stds = 1/np.random.gamma(1, 1/std_std, n_modes)
    # NOTE(review): the original code overwrote `stds` with [1.0]*n_modes
    # right after the draw above, silently ignoring the documented `std_std`
    # parameter. The dead override has been removed so the sampled standard
    # deviations are actually used — confirm this matches the demo's intent.

    # Each sample comes from a uniformly chosen mode.
    samples = np.zeros(n_samples)
    for i in range(n_samples):
        mode = np.random.randint(n_modes)
        samples[i] = np.random.normal(means[mode], stds[mode])

    imputed = np.median(samples)
    conf, X_L_list, X_D_list = su.continuous_imputation_confidence(
        samples, imputed, (), n_steps=n_steps, return_metadata=True)

    results = {
        'config': args,
        'conf': conf,
        'samples': samples,
        'X_L_list': X_L_list,
        'X_D_list': X_D_list,
    }
    return results
def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
        numsamples=None):
    # Predicts a value for the cell [rowid, colno] with a confidence metric,
    # returning (imp_val, confidence). The confidence is scaled down by the
    # minimum confidence of any parent cells that had to be imputed first.
    # XXX Prefer accuracy over speed for imputation.
    if numsamples is None:
        numsamples = self.n_samples
    colnos = core.bayesdb_generator_column_numbers(bdb, genid)
    colnames = core.bayesdb_generator_column_names(bdb, genid)
    row = core.bayesdb_generator_row_values(bdb, genid, rowid)
    # Account for multiple imputations if imputing parents.
    parent_conf = 1

    # Predicting lcol.
    if colno in self.lcols(bdb, genid):
        # Delegate to CC IFF
        # (lcol has no children OR all its children are None).
        children = [f for f in self.fcols(bdb, genid)
            if colno in self.pcols(bdb, genid, f)]
        if len(children) == 0 or \
                all(row[i] is None for i in xrange(len(row))
                    if i+1 in children):
            return self.cc(bdb, genid).predict_confidence(bdb,
                self.cc_id(bdb, genid), modelno,
                self.cc_colno(bdb, genid, colno), rowid)
        else:
            # Obtain likelihood weighted samples from posterior.
            Q = [(rowid, colno)]
            Y = [(rowid, c, v) for c, v in zip(colnos, row)
                 if c != colno and v is not None]
            samples = self.simulate(bdb, genid, modelno, Q, Y,
                numpredictions=numsamples)
            samples = [s[0] for s in samples]

    # Predicting fcol.
    else:
        # Conditions are the foreign column's parent values from this row.
        conditions = {c: v for c, v in zip(colnames, row)
            if core.bayesdb_generator_column_number(bdb, genid, c)
                in self.pcols(bdb, genid, colno)}
        for colname, val in conditions.iteritems():
            # Impute all missing parents.
            if val is None:
                imp_col = core.bayesdb_generator_column_number(bdb, genid,
                    colname)
                imp_val, imp_conf = self.predict_confidence(bdb, genid,
                    modelno, imp_col, rowid, numsamples=numsamples)
                # XXX If imputing several parents, take the overall conf as
                # the min conf. If we define imp_conf as P[imp_val = correct]
                # then we might choose to multiply the imp_confs, but we
                # cannot assert that the imp_confs are independent so
                # multiplying is extremely conservative.
                parent_conf = min(parent_conf, imp_conf)
                conditions[colname] = imp_val
        assert all(v is not None for c, v in conditions.iteritems())
        predictor = self.predictor(bdb, genid, colno)
        samples = predictor.simulate(numsamples, conditions)

    # Since foreign predictor does not know how to impute, imputation
    # shall occur here in the composer by simulate/logpdf calls.
    stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
    if stattype == 'categorical':
        # imp_val is the most frequent sample; imp_conf is its density
        # (foreign column) or relative frequency (local column).
        imp_val = max(((val, samples.count(val)) for val in set(samples)),
            key=lambda v: v[1])[0]
        if colno in self.fcols(bdb, genid):
            imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
        else:
            # BUG FIX: force float division. Under Python 2 the original
            # `sum(...) / len(samples)` floor-divided, truncating the
            # relative frequency to 0 unless every sample matched imp_val.
            imp_conf = sum(np.array(samples) == imp_val) / float(len(samples))
    elif stattype == 'numerical':
        # XXX The definition of confidence is P[k=1] where k=1 is the
        # number of mixture components (we need a distribution over GPMM
        # to answer this question). The confidence is instead implemented
        # as \max_i{p_i} where p_i are the weights of a fitted DPGMM.
        imp_val = np.mean(samples)
        imp_conf = su.continuous_imputation_confidence(samples, None, None,
            n_steps=1000)
    else:
        raise ValueError('Unknown stattype "{}" for a foreign predictor '
            'column encountered in predict_confidence.'.format(stattype))

    return imp_val, imp_conf * parent_conf