Пример #1
0
def test_transition_hypers(cctype):
    """Check that hyperparameter inference moves the hypers after new data.

    `cctype` is a `(name, distargs)` pair. A model of that type is fed 25
    observations, its hypers are transitioned, and the result must differ
    from the initial hypers in at least one entry. The same 25 observations
    are then incorporated again under rowids 25..49, and a second round of
    transitions must again change at least one hyper.
    """
    dist_name, dist_args = cctype
    model_class = cu.cctype_class(dist_name)
    model = model_class(
        outputs=[0], inputs=None, distargs=dist_args, rng=gu.gen_rng(10))
    D, _Zv, _Zc = tu.gen_data_table(
        50, [1], [[.33, .33, .34]], [dist_name], [dist_args], [.8],
        rng=gu.gen_rng(1))

    def feed(offset):
        # Incorporate the first 25 flattened observations, shifting rowids.
        for position, value in enumerate(np.ravel(D)[:25]):
            model.incorporate(position + offset, {0: value}, None)

    hypers_initial = model.get_hypers()

    # First batch: rowids 0..24.
    feed(0)
    model.transition_hypers(N=3)
    hypers_first = model.get_hypers()
    assert any(
        not np.allclose(hypers_first[key], hypers_initial[key])
        for key in hypers_first)

    # Second batch: rowids 25..49.
    feed(25)
    model.transition_hypers(N=3)
    hypers_second = model.get_hypers()
    assert any(
        not np.allclose(hypers_first[key], hypers_second[key])
        for key in hypers_second)
Пример #2
0
 def update_cctype(self, col, cctype, distargs=None):
     """Swap the dimension at self.dims[col] for a new one of type cctype.

     A copy of `distargs` (or an empty dict when it is None) is handed to
     the new Dim. For a conditional cctype, the inputs are taken from
     `distargs['inputs']` if present, otherwise from every other dimension
     that is not itself conditional, in sorted order. The 'inputs' entry
     is then replaced by a dict describing their indexes, stattypes and
     statargs.

     Raises ValueError when a conditional cctype ends up with no inputs.
     """
     distargs_dim = {} if distargs is None else dict(distargs)
     inputs = []
     # XXX Horrid hack.
     if cctype_class(cctype).is_conditional():
         # Default: all other non-conditional dimensions, in sorted order.
         fallback_inputs = [
             d for d in sorted(self.dims)
             if d != col and not self.dims[d].is_conditional()
         ]
         inputs = distargs_dim.get('inputs', fallback_inputs)
         if not (len(self.dims) > 0 and len(inputs) > 0):
             raise ValueError('No inputs for conditional dimension.')
         input_dims = [self.dims[i] for i in inputs]
         distargs_dim['inputs'] = {
             'indexes': inputs,
             'stattypes': [dim.cctype for dim in input_dims],
             'statargs': [dim.get_distargs() for dim in input_dims],
         }
     previous_dim = self.dims[col]
     # The first input of the new Dim is this object's first output.
     replacement_dim = Dim(
         outputs=[col],
         inputs=[self.outputs[0]] + inputs,
         cctype=cctype,
         distargs=distargs_dim,
         rng=self.rng)
     self.unincorporate_dim(previous_dim)
     self.incorporate_dim(replacement_dim)
Пример #3
0
def plot_simulations(cctype, D_train, D_test, D_posterior):
    """Plot train, test and posterior data for `cctype` and save the figure.

    The continuous or discrete plotting helper is chosen from the cctype
    class's `is_continuous()`. The figure is titled with the cctype, sized
    to 8x6 inches, written under `resources/` with a timestamped name, and
    all pyplot figures are closed afterwards.
    """
    if cu.cctype_class(cctype).is_continuous():
        plotter = _plot_simulations_continuous
    else:
        plotter = _plot_simulations_discrete
    fig, ax = plotter(D_train, D_test, D_posterior)

    fig.suptitle(cctype, fontsize=16, fontweight='bold')
    fig.set_size_inches(8, 6)
    output_path = 'resources/%s-%s' % (cu.timestamp(), cctype)
    fig.savefig(output_path, dpi=100)
    plt.close('all')
Пример #4
0
 def _predictor_count(cct, cca):
     """Return `(p, counts)` for an input of stattype `cct` with args `cca`.

     Numeric inputs give `(1, None)`. Inputs whose args contain 'k' are
     dummy coded: values {1,...,K} map to a code of K-1 entries, with all
     zeros standing for value K, so the result is `(K - 1, K)`.

     Raises ValueError for a non-numeric stattype whose args have no 'k'.
     """
     # XXX Determine statistical types and arguments of inputs.
     if cct == 'numerical' or cu.cctype_class(cct).is_numeric():
         return 1, None
     if cca is None or 'k' not in cca:
         raise ValueError('Invalid stattype, stargs: %s, %s.' % (cct, cca))
     num_categories = cca['k']
     code_length = num_categories - 1
     counts = int(num_categories)
     return int(code_length), counts
Пример #5
0
def two_sample_test(cctype, X, Y):
    """Return the p-value of a two-sample test between samples X and Y.

    Numeric cctypes use `ks_2samp`. All others use `chisquare` on aligned
    bin counts: bins that are empty in both samples are dropped and each
    histogram is rescaled to sum to 1000, with X as the expected
    frequencies.
    """
    # XXX WRONG CHOICE FOR DISCRETE NUMERIC XXX
    if cu.cctype_class(cctype).is_numeric():
        _stat, pval = ks_2samp(X, Y)
        return pval

    counts_x, counts_y = aligned_bincount([X, Y])
    # Keep only the bins where at least one sample has observations.
    both_empty = np.logical_and(counts_x == 0, counts_y == 0)
    keep = np.logical_not(both_empty)
    counts_x, counts_y = counts_x[keep], counts_y[keep]
    scaled_x = counts_x / float(sum(counts_x)) * 1000
    scaled_y = counts_y / float(sum(counts_y)) * 1000
    _stat, pval = chisquare(scaled_y, f_exp=scaled_x)
    return pval
Пример #6
0
 def _predictor_count(cct, cca):
     """Return `(p, counts)` for an input of stattype `cct` with args `cca`.

     Numeric inputs give `(1, None)`. Inputs whose args contain 'k' give
     `(K, K + 1)`: the code has K entries and all zeros is treated as a
     "wildcard" category.

     Raises ValueError for a non-numeric stattype whose args are None or
     have no 'k'.
     """
     # XXX Determine statistical types and arguments of inputs.
     if cct == 'numerical' or cu.cctype_class(cct).is_numeric():
         p, counts = 1, None
     elif cca is not None and 'k' in cca:
         # In dummy coding, if the category has values {1,...,K} then its
         # code contains (K-1) entries, where all zeros indicates value K.
         # However, we are going to treat all zeros indicating the input to
         # be a "wildcard" category, so that the code has K entries. This
         # way the queries are robust to unspecified or misspecified
         # categories.
         p, counts = cca['k'], int(cca['k'])+1
     else:
         # Without this branch `p` and `counts` stay unbound and the return
         # below fails with an UnboundLocalError instead of a clear message.
         raise ValueError('Invalid stattype, stargs: %s, %s.' % (cct, cca))
     return int(p), counts
Пример #7
0
def generate_gpmcc_posteriors(cctype, distargs, D_train, iters, seconds):
    """Fit a 64-state Engine to D_train and return samples from the top 5.

    The engine is transitioned with `N=iters, S=seconds`. When `iters` is
    truthy it then gets 100 further transitions of a single kernel:
    'column_params' for conditional cctypes, 'column_hypers' otherwise.
    NUM_TEST simulations of variable 0 are drawn, the states are ranked by
    `logpdf_score()` in descending order, the five best are plotted, and
    their samples are returned as a list.
    """
    # Learning and posterior simulation.
    engine = Engine(
        D_train,
        cctypes=[cctype],
        distargs=[distargs],
        num_states=64,
        rng=gu.gen_rng(1))
    engine.transition(N=iters, S=seconds, progress=0)
    if iters:
        if cu.cctype_class(cctype).is_conditional():
            kernel = 'column_params'
        else:
            kernel = 'column_hypers'
        engine.transition(N=100, kernels=[kernel], progress=0)

    samples = engine.simulate(-1, [0], N=NUM_TEST)
    scores = engine.logpdf_score()
    # Indices of the five highest-scoring states, best first.
    best_states = np.argsort(scores)[::-1][:5]
    for state_index in best_states:
        engine.get_state(state_index).plot()
    return [samples[state_index] for state_index in best_states]
Пример #8
0
def assert_distribution(cctype, outputs, inputs, distargs, good, bad):
    """Run `assert_good` / `assert_bad` over sample values for one cctype.

    A model of type `cctype` is built from `outputs`, `inputs` and
    `distargs`. Each item of `good` is passed to `assert_good` and each
    item of `bad` to `assert_bad`, using its position in the sequence as
    the rowid.
    """
    model_class = cu.cctype_class(cctype)
    model = model_class(outputs, inputs, distargs=distargs)
    for check, values in ((assert_good, good), (assert_bad, bad)):
        for rowid, value in enumerate(values):
            check(model, rowid, value)
Пример #9
0
    def __init__(self, outputs, inputs, cctype=None, hypers=None, params=None,
                 distargs=None, rng=None):
        """Construct a Dim for a single output variable.

        Dim provides a convenience method for bulk incorporate and
        unincorporate by specifying the data and optional row partition.

        Parameters
        ----------
        outputs : list<int>
            A singleton list containing the identifier of the output variable.
        inputs : list<int>
            A list of at least length 1. The first item is the index of the
            variable corresponding to the required cluster identity. The
            remaining items are input variables to the internal cgpms.
        cctype : str, optional
            DistributionGpm name, see `cgpm.utils.config`; resolved through
            `cu.cctype_class`.
        hypers : dict, optional
            Shared hypers of internal cgpms. Copied; defaults to empty.
        params : dict, optional
            Currently disabled (not read by this constructor).
        distargs : dict, optional
            Distargs appropriate for the cctype. Copied; defaults to empty.
        rng : np.random.RandomState, optional
            Source of entropy; `gu.gen_rng()` is used when omitted.

        Raises
        ------
        ValueError
            If `outputs` does not have exactly one item, or `inputs` is empty.
        """
        # Seed.
        if rng is None:
            rng = gu.gen_rng()
        self.rng = rng

        # Outputs: exactly one.
        if len(outputs) != 1:
            raise ValueError('Dim requires exactly 1 output.')
        self.outputs = list(outputs)

        # Inputs: at least one.
        if len(inputs) < 1:
            raise ValueError('Dim requires at least 1 input.')
        self.inputs = list(inputs)

        # Identifier: the single output variable.
        self.index = self.outputs[0]

        # DistributionCGpm class, its name and its distargs.
        self.model = cu.cctype_class(cctype)
        self.cctype = self.model.name()
        self.distargs = {} if distargs is None else dict(distargs)

        # Hyperparameters.
        self.hyper_grids = {}
        self.hypers = {} if hypers is None else dict(hypers)

        # Clusters and assignments.
        self.clusters = {}  # Mapping of cluster k to the object.
        self.Zr = {}  # Mapping of non-nan rowids to cluster k.
        self.Zi = {}  # Mapping of nan rowids to cluster k.

        # Auxiliary singleton, created last after the fields above are set.
        self.aux_model = self.create_aux_model()