Example #1
0
def run(args):
    ''' Demo of inference confidence

    `args` is a dict with the following keys

    n_samples : int
        the number of samples for imputation
    n_steps : int
        the number of crosscat iterations to use for confidence
    n_modes : int
        the number of modes from which to draw the sample data
    mean_std : float (greater than 0)
        the standard deviation between the means of the sample modes
    std_std : float (greater than 0)
        the standard deviation between the standard deviation of the modes
    seed : int
        RNG seed. If seed < 0, system time is used.

    Returns a dict with the input config, the confidence score, the drawn
    samples, and the crosscat metadata (X_L_list, X_D_list).
    '''
    n_samples = args['n_samples']
    n_steps = args['n_steps']
    n_modes = args['n_modes']
    mean_std = args['mean_std']
    std_std = args['std_std']
    seed = args['seed']

    if seed < 0:
        # Negative seed requests a time-based seed.
        import time
        seed = int(time.time())

    np.random.seed(seed)
    # Draw each mode's mean; mode standard deviations are drawn from an
    # inverse-gamma so they are strictly positive.
    means = np.random.normal(0, mean_std, n_modes)
    stds = 1/np.random.gamma(1, 1/std_std, n_modes)
    # NOTE: removed a leftover debug line (`stds = [1.0]*n_modes`) that
    # overwrote the draw above and silently ignored `std_std`.

    # Each sample comes from a uniformly chosen mode.
    samples = np.zeros(n_samples)
    for i in range(n_samples):
        mode = np.random.randint(n_modes)
        samples[i] = np.random.normal(means[mode], stds[mode])

    imputed = np.median(samples)
    conf, X_L_list, X_D_list = su.continuous_imputation_confidence(
        samples, imputed, (), n_steps=n_steps, return_metadata=True)

    results = {
        'config': args,
        'conf': conf,
        'samples': samples,
        'X_L_list': X_L_list,
        'X_D_list': X_D_list,
    }

    return results
Example #2
0
 def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
         numsamples=None):
     """Predict a value for the cell [rowid, colno] with a confidence metric.

     Returns an (imp_val, imp_conf * parent_conf) pair, where parent_conf
     downweights the confidence when missing parent values had to be
     imputed recursively.

     NOTE(review): this body is Python 2 (`xrange`, `iteritems`).
     """
     # XXX Prefer accuracy over speed for imputation.
     if numsamples is None:
         numsamples = self.n_samples
     colnos = core.bayesdb_generator_column_numbers(bdb, genid)
     colnames = core.bayesdb_generator_column_names(bdb, genid)
     row = core.bayesdb_generator_row_values(bdb, genid, rowid)
     # Account for multiple imputations if imputing parents: overall
     # confidence is scaled by the weakest parent imputation (see below).
     parent_conf = 1
     # Case 1: predicting a local (crosscat-modeled) column.
     if colno in self.lcols(bdb, genid):
         # Delegate directly to crosscat IFF
         # (lcol has no children OR all its children are None).
         children = [f for f in self.fcols(bdb, genid) if colno in
                 self.pcols(bdb, genid, f)]
         # NOTE(review): the `i+1 in children` test presumably maps the
         # 0-based position in `row` to a 1-based column number — confirm
         # against bayesdb_generator_row_values ordering.
         if len(children) == 0 or \
                 all(row[i] is None for i in xrange(len(row)) if i+1
                     in children):
             return self.cc(bdb, genid).predict_confidence(bdb,
                     self.cc_id(bdb, genid), modelno,
                     self.cc_colno(bdb, genid, colno), rowid)
         else:
             # Obtain likelihood weighted samples from posterior,
             # conditioning on every other observed cell in the row.
             Q = [(rowid, colno)]
             Y = [(rowid, c, v) for c,v in zip(colnos, row)
                  if c != colno and v is not None]
             samples = self.simulate(bdb, genid, modelno, Q, Y,
                 numpredictions=numsamples)
             # simulate returns one row per prediction; Q has one cell,
             # so keep the first (only) value of each row.
             samples = [s[0] for s in samples]
     # Case 2: predicting a foreign-predictor column.
     else:
         # Gather this column's parent values, keyed by column name.
         conditions = {c:v for c,v in zip(colnames, row) if
             core.bayesdb_generator_column_number(bdb, genid, c) in
             self.pcols(bdb, genid, colno)}
         # Only values are reassigned during iteration (no keys added or
         # removed), so mutating `conditions` inside the loop is safe.
         for colname, val in conditions.iteritems():
             # Impute all missing parents recursively.
             if val is None:
                 imp_col = core.bayesdb_generator_column_number(bdb, genid,
                     colname)
                 imp_val, imp_conf = self.predict_confidence(bdb, genid,
                     modelno, imp_col, rowid, numsamples=numsamples)
                 # XXX If imputing several parents, take the overall
                 # conf as min conf. If we define imp_conf as
                 # P[imp_val = correct] then we might choose to multiply
                 # the imp_confs, but we cannot assert that the imp_confs
                 # are independent so multiplying is extremely conservative.
                 parent_conf = min(parent_conf, imp_conf)
                 conditions[colname] = imp_val
         assert all(v is not None for c,v in conditions.iteritems())
         predictor = self.predictor(bdb, genid, colno)
         samples = predictor.simulate(numsamples, conditions)
     # Since foreign predictor does not know how to impute, imputation
     # shall occur here in the composer by simulate/logpdf calls.
     stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
     if stattype == 'categorical':
         # imp_val is the most frequent sampled value.
         imp_val =  max(((val, samples.count(val)) for val in set(samples)),
             key=lambda v: v[1])[0]
         if colno in self.fcols(bdb, genid):
             # Foreign column: confidence is the predictor's probability
             # of the imputed value under the (fully imputed) conditions.
             imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
         else:
             # Local column: empirical frequency of the imputed value.
             # NOTE(review): under Python 2 this is int/int and may floor
             # to 0 or 1 — confirm true division is intended.
             imp_conf = sum(np.array(samples)==imp_val) / len(samples)
     elif stattype == 'numerical':
         # XXX The definition of confidence is P[k=1] where
         # k=1 is the number of mixture componets (we need a distribution
         # over GPMM to answer this question). The confidence is instead
         # implemented as \max_i{p_i} where p_i are the weights of a
         # fitted DPGMM.
         imp_val = np.mean(samples)
         # NOTE(review): without return_metadata this call presumably
         # returns just the confidence scalar — verify against
         # su.continuous_imputation_confidence's signature.
         imp_conf = su.continuous_imputation_confidence(samples, None, None,
             n_steps=1000)
     else:
         raise ValueError('Unknown stattype "{}" for a foreign predictor '
             'column encountered in predict_confidence.'.format(stattype))
     return imp_val, imp_conf * parent_conf