示例#1
0
def test_transition_hypers(cctype):
    """Check that hyperparameter transitions actually move the hypers.

    Parameters
    ----------
    cctype : tuple
        A ``(name, distargs)`` pair; ``name`` is resolved to a model class
        through ``cu.cctype_class`` and ``distargs`` is forwarded to it.

    The model is fed two batches of 25 observations. After each batch,
    ``transition_hypers(N=3)`` is run and at least one hyperparameter must
    differ from its value before that transition.
    """
    dist_name, dist_arg = cctype
    model_cls = cu.cctype_class(dist_name)
    model = model_cls(outputs=[0],
                      inputs=None,
                      distargs=dist_arg,
                      rng=gu.gen_rng(10))
    # Only the data table is used; the two remaining return values are
    # discarded.
    D, _, _ = tu.gen_data_table(50, [1], [[.33, .33, .34]], [dist_name],
                                [dist_arg], [.8],
                                rng=gu.gen_rng(1))
    first_batch = np.ravel(D)[:25]

    def _incorporate_batch(offset):
        # Feed the batch into the model under row ids offset, offset+1, ...
        for position, value in enumerate(first_batch):
            model.incorporate(position + offset, {0: value}, None)

    def _some_hyper_moved(before, after):
        # True as soon as one hyperparameter in `after` is not close to its
        # counterpart in `before`.
        return any(not np.allclose(after[key], before[key]) for key in after)

    hypers_previous = model.get_hypers()
    _incorporate_batch(0)
    model.transition_hypers(N=3)
    hypers_new = model.get_hypers()
    assert _some_hyper_moved(hypers_previous, hypers_new)

    # NOTE(review): the second batch re-uses the first 25 values under row
    # ids 25..49 rather than taking D[25:] -- confirm this is intended.
    _incorporate_batch(25)
    model.transition_hypers(N=3)
    hypers_newer = model.get_hypers()
    assert _some_hyper_moved(hypers_new, hypers_newer)
示例#2
0
 def update_cctype(self, col, cctype, distargs=None):
     """Replace the dimension at ``self.dims[col]`` with one of type cctype.

     Parameters
     ----------
     col : hashable
         Key into ``self.dims`` of the dimension to replace; also used as
         the single output of the new ``Dim``.
     cctype : str
         Name resolved through ``cctype_class``.
     distargs : dict, optional
         Distribution arguments. A shallow copy is taken, so the caller's
         dict is not modified.

     Raises
     ------
     ValueError
         If cctype is conditional and there are no dims, or no inputs
         could be determined for it.
     """
     distargs_dim = {} if distargs is None else dict(distargs)
     inputs = []
     # XXX Horrid hack.
     if cctype_class(cctype).is_conditional():
         # Default inputs: every other dimension that is not itself
         # conditional, in sorted key order. An explicit 'inputs' entry in
         # distargs takes precedence.
         fallback_inputs = [
             d for d in sorted(self.dims)
             if d != col and not self.dims[d].is_conditional()
         ]
         inputs = distargs_dim.get('inputs', fallback_inputs)
         if not (len(self.dims) > 0 and len(inputs) > 0):
             raise ValueError('No inputs for conditional dimension.')
         # Replace the plain index list by a description of each input.
         input_stattypes = [self.dims[i].cctype for i in inputs]
         input_statargs = [self.dims[i].get_distargs() for i in inputs]
         distargs_dim['inputs'] = {
             'indexes': inputs,
             'stattypes': input_stattypes,
             'statargs': input_statargs,
         }
     D_old = self.dims[col]
     # The first input of the new Dim is self.outputs[0], followed by the
     # inputs selected above.
     D_new = Dim(
         outputs=[col],
         inputs=[self.outputs[0]] + inputs,
         cctype=cctype,
         distargs=distargs_dim,
         rng=self.rng,
     )
     # Swap the dimensions: remove the old one before adding the new one.
     self.unincorporate_dim(D_old)
     self.incorporate_dim(D_new)
示例#3
0
def plot_simulations(cctype, D_train, D_test, D_posterior):
    """Plot the three datasets for cctype and save the figure to disk.

    The continuous or discrete plotting helper is chosen according to
    ``is_continuous()`` of the class returned by ``cu.cctype_class``. The
    figure is titled with cctype, sized to 8x6 inches, written to
    ``resources/<timestamp>-<cctype>`` at 100 dpi, and then all pyplot
    figures are closed.
    """
    model = cu.cctype_class(cctype)
    plotter = (_plot_simulations_continuous
               if model.is_continuous() else _plot_simulations_discrete)
    # The helpers return a (figure, axes) pair; only the figure is used.
    fig, _ = plotter(D_train, D_test, D_posterior)
    fig.suptitle(cctype, fontsize=16, fontweight='bold')
    fig.set_size_inches(8, 6)
    destination = 'resources/%s-%s' % (cu.timestamp(), cctype)
    fig.savefig(destination, dpi=100)
    plt.close('all')
示例#4
0
 def _predictor_count(cct, cca):
     # XXX Determine statistical types and arguments of inputs.
     if cct == 'numerical' or cu.cctype_class(cct).is_numeric():
         p, counts = 1, None
     elif cca is not None and 'k' in cca:
         # In dummy coding, if the category has values {1,...,K} then its
         # code contains (K-1) entries, where all zeros indicates value K.
         p, counts = cca['k'] - 1, int(cca['k'])
     else:
         raise ValueError('Invalid stattype, stargs: %s, %s.' % (cct, cca))
     return int(p), counts
示例#5
0
def two_sample_test(cctype, X, Y):
    """Return the p-value of a two-sample test comparing X and Y.

    When the class for cctype reports ``is_numeric()``, a two-sample
    Kolmogorov-Smirnov test is used. Otherwise the samples are binned with
    ``aligned_bincount``, bins that are empty in both samples are dropped,
    both histograms are rescaled to sum to 1000, and a chi-square test of
    Y's counts against X's counts is used.
    """
    if cu.cctype_class(cctype).is_numeric():
        # XXX WRONG CHOICE FOR DISCRETE NUMERIC XXX
        _, p_value = ks_2samp(X, Y)
        return p_value

    counts_x, counts_y = aligned_bincount([X, Y])
    # Keep only the bins that are non-empty in at least one sample.
    both_empty = np.logical_and(counts_x == 0, counts_y == 0)
    keep = np.logical_not(both_empty)
    counts_x = counts_x[keep]
    counts_y = counts_y[keep]
    # Rescale both histograms to a common total of 1000.
    counts_x = counts_x / float(sum(counts_x)) * 1000
    counts_y = counts_y / float(sum(counts_y)) * 1000
    _, p_value = chisquare(counts_y, f_exp=counts_x)
    return p_value
示例#6
0
文件: linreg.py 项目: wilsondy/cgpm
 def _predictor_count(cct, cca):
     # XXX Determine statistical types and arguments of inputs.
     if cct == 'numerical' or cu.cctype_class(cct).is_numeric():
         p, counts = 1, None
     elif cca is not None and 'k' in cca:
         # In dummy coding, if the category has values {1,...,K} then its
         # code contains (K-1) entries, where all zeros indicates value K.
         # However, we are going to treat all zeros indicating the input to
         # be a "wildcard" category, so that the code has K entries. This
         # way the queries are robust to unspecified or misspecified
         # categories.
         p, counts = cca['k'], int(cca['k'])+1
     return int(p), counts
示例#7
0
def generate_gpmcc_posteriors(cctype, distargs, D_train, iters, seconds):
    """Fit an Engine on D_train and return simulations from its best states.

    A 64-state ``Engine`` is built over a single column of type cctype and
    transitioned with ``N=iters`` and ``S=seconds``. If ``iters`` is
    truthy, 100 further transitions of one kernel are run:
    ``'column_params'`` for a conditional cctype, ``'column_hypers'``
    otherwise. ``NUM_TEST`` simulations are then drawn, the five states
    with the highest ``logpdf_score`` are plotted, and the corresponding
    entries of the simulation result are returned, best first.
    """
    # Learning and posterior simulation.
    engine = Engine(
        D_train,
        cctypes=[cctype],
        distargs=[distargs],
        num_states=64,
        rng=gu.gen_rng(1),
    )
    engine.transition(N=iters, S=seconds, progress=0)
    if iters:
        if cu.cctype_class(cctype).is_conditional():
            kernel = 'column_params'
        else:
            kernel = 'column_hypers'
        engine.transition(N=100, kernels=[kernel], progress=0)
    samples = engine.simulate(-1, [0], N=NUM_TEST)
    marginals = engine.logpdf_score()
    # Indexes of the five best-scoring states, in descending score order.
    top_states = np.argsort(marginals)[::-1][:5]
    for state_index in top_states:
        engine.get_state(state_index).plot()
    return [samples[state_index] for state_index in top_states]
示例#8
0
def assert_distribution(cctype, outputs, inputs, distargs, good, bad):
    """Run ``assert_good`` / ``assert_bad`` over example values for a model.

    A model of the class returned by ``cu.cctype_class(cctype)`` is built
    from outputs, inputs and distargs. Every item of ``good`` is passed to
    ``assert_good`` and every item of ``bad`` to ``assert_bad``, each with
    the item's position in its own sequence as the row id.
    """
    model_cls = cu.cctype_class(cctype)
    model = model_cls(outputs, inputs, distargs=distargs)
    # Row ids restart at 0 for the second sequence.
    for position, accepted in enumerate(good):
        assert_good(model, position, accepted)
    for position, rejected in enumerate(bad):
        assert_bad(model, position, rejected)
示例#9
0
文件: dim.py 项目: vishalbelsare/cgpm
    def __init__(self,
                 outputs,
                 inputs,
                 cctype=None,
                 hypers=None,
                 params=None,
                 distargs=None,
                 rng=None):
        """Set up an empty Dim for a single output variable.

        Parameters
        ----------
        outputs : list<int>
            A singleton list containing the identifier of the output
            variable; it also becomes ``self.index``.
        inputs : list<int>
            A list of at least length 1. The first item is the index of the
            variable corresponding to the required cluster identity. The
            remaining items are input variables to the internal cgpms.
        cctype : str, optional
            DistributionGpm name, see `cgpm.utils.config`; resolved with
            ``cu.cctype_class``.
            NOTE(review): earlier documentation said this defaults to
            normal; the default here is None -- confirm how
            ``cu.cctype_class`` treats None.
        hypers : dict, optional
            Shared hypers of internal cgpms; copied, empty if omitted.
        params : dict, optional
            Currently disabled; not read by this constructor.
        distargs : dict, optional
            Distargs appropriate for the cctype; copied, empty if omitted.
        rng : np.random.RandomState, optional
            Source of entropy; ``gu.gen_rng()`` is used if omitted.

        Raises
        ------
        ValueError
            If ``outputs`` does not have exactly one item, or ``inputs``
            is empty.
        """
        # Entropy source.
        if rng is None:
            rng = gu.gen_rng()
        self.rng = rng

        # Output variable: exactly one is required.
        if len(outputs) != 1:
            raise ValueError('Dim requires exactly 1 output.')
        self.outputs = list(outputs)

        # Input variables: at least one is required.
        if len(inputs) == 0:
            raise ValueError('Dim requires at least 1 input.')
        self.inputs = list(inputs)

        # The Dim is identified by its single output.
        self.index = self.outputs[0]

        # Distribution class, its canonical name, and its arguments.
        self.model = cu.cctype_class(cctype)
        self.cctype = self.model.name()
        self.distargs = {} if distargs is None else dict(distargs)

        # Hyperparameters and their grids.
        self.hyper_grids = {}
        self.hypers = {} if hypers is None else dict(hypers)

        # Clusters and row assignments, all initially empty.
        self.clusters = {}  # Mapping of cluster k to the object.
        self.Zr = {}  # Mapping of non-nan rowids to cluster k.
        self.Zi = {}  # Mapping of nan rowids to cluster k.

        # Auxiliary singleton model; built last, once the attributes above
        # have been set.
        self.aux_model = self.create_aux_model()