示例#1
0
文件: runner.py 项目: jzf2101/lda
    def run(self, r, niters=10000):
        """Advance the mixturemodel sampler by `niters` iterations in the
        calling thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        doc_model = bind(self._latent, data=self._view)
        for _ in xrange(niters):
            for kernel_name, _unused_config in self._kernel_config:
                # 'assign' is the only kernel this runner understands
                if kernel_name != 'assign':
                    assert False, 'should not be reached'
                # first resample the document-level assignments ...
                assign2(doc_model, r)
                # ... then the table-level assignments, one per document
                table_models = [bind(self._latent, document=did)
                                for did in xrange(self._latent.nentities())]
                for table_model in table_models:
                    assign(table_model, r)
示例#2
0
文件: runner.py 项目: jzf2101/lda
    def run(self, r, niters=10000):
        """Run the mixturemodel kernel for `niters` iterations,
        single-threaded.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        document_model = bind(self._latent, data=self._view)
        for _ in xrange(niters):
            for name, _config in self._kernel_config:
                if name == 'assign':
                    # resample document-level assignments, then bind a
                    # per-document model and resample each table assignment
                    assign2(document_model, r)
                    per_doc = [bind(self._latent, document=did)
                               for did in xrange(self._latent.nentities())]
                    for doc_model in per_doc:
                        assign(doc_model, r)
                else:
                    assert False, 'should not be reached'
示例#3
0
    def run(self, r, niters=10000):
        """Run the configured IRM kernels for `niters` iterations in the
        calling thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        ndomains = len(self._defn.domains())
        models = [bind(self._latent, di, self._views)
                  for di in xrange(ndomains)]
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'assign':
                    # conjugate gibbs assignment per configured domain
                    for di in config.keys():
                        gibbs.assign(models[di], r)
                elif name == 'assign_resample':
                    # non-conjugate gibbs with m auxiliary samples
                    for di, params in config.iteritems():
                        gibbs.assign_resample(models[di], params['m'], r)
                elif name == 'slice_cluster_hp':
                    for di, params in config.iteritems():
                        slice.hp(models[di], r, cparam=params['cparam'])
                elif name == 'grid_relation_hp':
                    gibbs.hp(models[0], config, r)
                elif name == 'slice_relation_hp':
                    slice.hp(models[0], r, hparams=config['hparams'])
                elif name == 'theta':
                    slice.theta(models[0], r, tparams=config['tparams'])
                else:
                    assert False, "should not be reached"
示例#4
0
    def run(self, r, niters=10000):
        """Run the specified mixturemodel kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        model = bind(self._latent, self._view)
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'assign':
                    gibbs.assign(model, r)
                elif name == 'assign_resample':
                    gibbs.assign_resample(model, config['m'], r)
                elif name == 'grid_feature_hp':
                    gibbs.hp(model, config, r)
                elif name == 'slice_feature_hp':
                    slice.hp(model, r, hparams=config['hparams'])
                elif name == 'slice_cluster_hp':
                    slice.hp(model, r, cparam=config['cparam'])
                elif name == 'theta':
                    slice.theta(model, r, tparams=config['tparams'])
                else:
                    # fixed typo: message previously read "should not be reach"
                    assert False, "should not be reached"
示例#5
0
    def run(self, r, niters=10000):
        """Run the lda kernel for `niters` iterations, single-threaded.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'crf':
                    lda_crp_gibbs(self._latent, r)
                    continue
                if name == 'direct_base_dp_hp':
                    sample_gamma(self._latent, r, config['hp1'], config['hp2'])
                    continue
                if name == 'direct_second_dp_hp':
                    sample_alpha(self._latent, r, config['hp1'], config['hp2'])
                    continue
                if name == 'direct_vocab_hp':
                    raise NotImplementedError(
                        'direct_vocab_hp not yet implemented')
                raise ValueError(
                    "Bad kernel specification {}".format(name))
示例#6
0
    def run(self, r, niters=10000):
        """Run each runner for `niters`, using the backend supplied in the
        constructor for parallelism.

        Parameters
        ----------
        r : rng
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        if self._backend == 'multiprocessing':
            pool = mp.Pool(processes=self._processes)
            # one (runner, niters, seed, statearg) tuple per runner;
            # r.next() presumably draws an independent seed per worker -- confirm
            args = [(runner, niters, r.next(), None)
                    for runner in self._runners]
            # map_async() + get() allows us to workaround a bug where
            # control-C doesn't kill multiprocessing workers
            self._runners = pool.map_async(_mp_work, args).get(10000000)
            pool.close()
            pool.join()
        elif self._backend == 'multyvac':

            # XXX(stephentu): the only parallelism strategy thus far is every
            # runner gets a dedicated core (multicore=1) on a machine
            jids = []
            has_volume = bool(self._volume)
            zipped = zip(self._runners, self._digests)
            expensive_states = []
            for i, (runner, digest) in enumerate(zipped):
                if has_volume:
                    # ship the expensive state via the shared volume instead
                    # of pickling it with the job; stash it locally so it can
                    # be restored after the jobs come back
                    statearg = (self._volume, 'state-{}'.format(digest))
                    expensive_states.append(runner.expensive_state)
                    runner.expensive_state = None
                else:
                    statearg = None
                args = (runner, niters, r.next(), statearg)
                jids.append(
                    multyvac.submit(
                        _mp_work,
                        args,
                        _ignore_module_dependencies=True,
                        _layer=self._layer,
                        _vol=self._volume,
                        _env=dict(self._env),  # submit() mutates the env
                        _core=self._core,
                        _name='kernels-parallel-runner-{}'.format(i)))
            # block until every job finishes, collecting the updated runners
            self._runners = [multyvac.get(jid).get_result() for jid in jids]
            if not expensive_states:
                return
            # re-attach the expensive state stripped off before submission
            for runner, state in zip(self._runners, expensive_states):
                runner.expensive_state = state
        else:
            assert False, 'should not be reached'
示例#7
0
    def run(self, r, niters=10000):
        """Run each runner for `niters`, using the backend supplied in the
        constructor for parallelism.

        Parameters
        ----------
        r : rng
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        if self._backend == 'multiprocessing':
            pool = mp.Pool(processes=self._processes)
            # build one argument tuple per runner; r.next() presumably
            # provides each worker its own seed -- confirm against rng API
            args = [(runner, niters, r.next(), None)
                    for runner in self._runners]
            # map_async() + get() allows us to workaround a bug where
            # control-C doesn't kill multiprocessing workers
            self._runners = pool.map_async(_mp_work, args).get(10000000)
            pool.close()
            pool.join()
        elif self._backend == 'multyvac':

            # XXX(stephentu): the only parallelism strategy thus far is every
            # runner gets a dedicated core (multicore=1) on a machine
            jids = []
            has_volume = bool(self._volume)
            zipped = zip(self._runners, self._digests)
            expensive_states = []
            for i, (runner, digest) in enumerate(zipped):
                if has_volume:
                    # when a volume is available, pass a (volume, key) handle
                    # and strip the expensive state so it is not pickled with
                    # the job payload; remember it for restoration below
                    statearg = (self._volume, 'state-{}'.format(digest))
                    expensive_states.append(runner.expensive_state)
                    runner.expensive_state = None
                else:
                    statearg = None
                args = (runner, niters, r.next(), statearg)
                jids.append(
                    multyvac.submit(
                        _mp_work,
                        args,
                        _ignore_module_dependencies=True,
                        _layer=self._layer,
                        _vol=self._volume,
                        _env=dict(self._env),  # submit() mutates the env
                        _core=self._core,
                        _name='kernels-parallel-runner-{}'.format(i)))
            # blocking collection of results, in submission order
            self._runners = [multyvac.get(jid).get_result() for jid in jids]
            if not expensive_states:
                return
            # put the stripped expensive state back on the returned runners
            for runner, state in zip(self._runners, expensive_states):
                runner.expensive_state = state
        else:
            assert False, 'should not be reached'
示例#8
0
    def __init__(self, defn, view, latent, kernel_config):
        """Validate the model definition, data view, latent state, and
        kernel configuration for an HDP runner.

        Parameters
        ----------
        defn : model_definition
        view : data view (currently a list of lists)
        latent : state
        kernel_config : iterable of kernel names or (name, config) pairs;
            supported kernels are 'beam' and 'hypers'.

        Raises
        ------
        ValueError
            If a kernel name is unknown or a 'hypers' config is missing the
            required alpha0/gamma parameters.
        """
        validator.validate_type(defn, model_definition, 'defn')
        # validator.validate_type(view, abstract_dataview, 'view') # for now, view is actually a list of lists
        validator.validate_type(latent, state, 'latent')

        self._defn = defn
        self._view = view
        self._latent = latent

        self._kernel_config = []
        for kernel in kernel_config:
            # a kernel may be given as a bare name or as (name, config)
            if hasattr(kernel, '__iter__'):
                name, config = kernel
            else:
                name, config = kernel, {}
            validator.validate_dict_like(config)

            if name == 'beam':
                pass
            elif name == 'hypers':
                # alpha0: either fixed ('alpha') or given gamma hyper-priors
                # ('alpha_a'/'alpha_b') -- exactly one form must be present
                if 'alpha' in config:
                    assert 'alpha_a' not in config and 'alpha_b' not in config
                    alpha = config['alpha']
                    assert alpha > 0
                    latent.fix_alpha(alpha)
                elif 'alpha_a' in config and 'alpha_b' in config:
                    assert 'alpha' not in config
                    alpha_a = config['alpha_a']
                    alpha_b = config['alpha_b']
                    assert alpha_a > 0 and alpha_b > 0
                    latent.set_alpha_hypers(alpha_a, alpha_b)
                else:
                    raise ValueError("Configuration missing parameters for alpha0")

                # gamma: same fixed-vs-hyper-prior scheme as alpha0
                if 'gamma' in config:
                    assert 'gamma_a' not in config and 'gamma_b' not in config
                    gamma = config['gamma']
                    assert gamma > 0
                    latent.fix_gamma(gamma)
                elif 'gamma_a' in config and 'gamma_b' in config:
                    assert 'gamma' not in config
                    gamma_a = config['gamma_a']
                    gamma_b = config['gamma_b']
                    assert gamma_a > 0 and gamma_b > 0
                    latent.set_gamma_hypers(gamma_a, gamma_b)
                else:
                    raise ValueError("Configuration missing parameters for gamma")
            else:
                raise ValueError("bad kernel found: {}".format(name))

            # BUG FIX: this append was previously dedented outside the loop,
            # so only the final kernel in kernel_config was ever recorded
            self._kernel_config.append((name, config))
示例#9
0
文件: runner.py 项目: rockhowse/lda
    def run(self, r, niters=10000):
        """Advance the LDA collapsed CRP Gibbs sampler by `niters`
        iterations in the calling thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        for _ in xrange(niters):
            lda_crp_gibbs(self._latent, r)
示例#10
0
    def run(self, r, niters=10000):
        """Run `niters` sweeps of the LDA CRP Gibbs kernel, single-threaded.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        remaining = niters
        while remaining > 0:
            lda_crp_gibbs(self._latent, r)
            remaining -= 1
示例#11
0
def default_assign_kernel_config(defn):
    """Build the default kernel configuration for sampling every domain's
    assignment (clustering) vector; the default kernel is currently a
    gibbs sampler.

    Parameters
    ----------
    defn : irm definition

    """
    validator.validate_type(defn, model_definition, 'defn')

    # XXX(stephentu): model_descriptors should implement
    # is_conjugate()
    nonconj_inds = [idx for idx, m in enumerate(defn.relation_models())
                    if m.name() == 'bbnc']

    # domains touched by any non-conjugate relation need the resample kernel
    nonconj_domains = set()
    for idx in nonconj_inds:
        nonconj_domains.update(defn.relations()[idx])
    conj_domains = [did for did in xrange(len(defn.domains()))
                    if did not in nonconj_domains]
    nonconj_domains = list(nonconj_domains)

    kernels = []
    if conj_domains:
        kernels.append(('assign', conj_domains))
    if nonconj_domains:
        kernels.append((
            'assign_resample',
            {did: {'m': 10} for did in nonconj_domains},
        ))
        kernels.append((
            'theta',
            {'tparams': {idx: {'p': 0.1} for idx in nonconj_inds}},
        ))
    return kernels
示例#12
0
def toy_dataset_and_states(defn, states=5, avglen=100, numobs=100):
    """Create a toy dataset for evaluating HMM inference, return
    the data as well as the latent state sequence.

    Parameters
    ----------
    defn:   model definition
    states: number of latent states
    avglen: average length of one observation sequence
        (actual length is sampled from a poisson distribution)
    numobs: number of observation sequences

    Output
    ------
    data:   the observations generated from the HMM
    states: the corresponding latent state sequence
    """

    validator.validate_type(defn, model_definition, 'defn')
    # build the toy emission/transition matrices, then sample sequences
    obs_mat, trans_mat = toy_model(defn, states)
    return gen_data(trans_mat, obs_mat, avglen, numobs)
示例#13
0
def toy_dataset_and_states(defn, states=5, avglen=100, numobs=100):
    """Generate a toy HMM dataset together with its latent state sequence.

    Parameters
    ----------
    defn : model definition
    states : int
        Number of latent states.
    avglen : int
        Average observation-sequence length (actual lengths are sampled
        from a poisson distribution).
    numobs : int
        Number of observation sequences.

    Returns
    -------
    data : the observations generated from the HMM
    states : the corresponding latent state sequence
    """
    validator.validate_type(defn, model_definition, 'defn')
    emission_mat, transition_mat = toy_model(defn, states)
    return gen_data(transition_mat, emission_mat, avglen, numobs)
示例#14
0
def default_cluster_hp_kernel_config(defn):
    """Build the default kernel configuration for sampling the clustering
    (Chinese Restaurant Process) hyper-parameter; the default kernel is
    currently a one-dimensional slice sampler.

    Parameters
    ----------
    defn : irm definition
        The hyper-priors set in the definition are used to configure the
        hyper-parameter sampling kernels.
    """
    validator.validate_type(defn, model_definition, 'defn')
    # one cparam entry per domain that actually declares hyper-priors;
    # 0.1 is the slice-sampler step width
    config = {
        i: {'cparam': {key: (prior_fn, 0.1)
                       for key, prior_fn in hp.iteritems()}}
        for i, hp in enumerate(defn.domain_hyperpriors()) if hp
    }
    return [('slice_cluster_hp', config)] if config else []
示例#15
0
def default_relation_hp_kernel_config(defn):
    """Build the default kernel configuration for sampling the component
    (feature) model hyper-parameters; the default kernel is currently a
    one-dimensional slice sampler.

    Parameters
    ----------
    defn : irm definition
        The hyper-priors set in the definition are used to configure the
        hyper-parameter sampling kernels.

    """
    validator.validate_type(defn, model_definition, 'defn')
    # XXX(stephentu): we are arbitrarily picking w=0.1
    hparams = {
        i: {k: (fn, 0.1) for k, fn in hp.iteritems()}
        for i, hp in enumerate(defn.relation_hyperpriors()) if hp
    }
    if hparams:
        return [('slice_relation_hp', {'hparams': hparams})]
    return []
示例#16
0
    def run(self, r, niters=10000):
        """Run the specified kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        for _ in xrange(niters):
            # This goes against every object-oriented bone in my body, but the interface must be satisfied
            # And actually Python won't even let me do this because I'm accessing a method in a C++ class...
            # I'd have to write this whole thing in Cython or change the state interface to expose all these
            # functions separately...which might actually be worth doing.
            # NOTE(review): each sweep reaches through the wrapper's raw
            # pointer to run one full blocked-Gibbs-style update; `r` is
            # unused here -- presumably the C++ state owns its own rng, confirm
            self._latent._thisptr.get()[0].sample_aux()
            self._latent._thisptr.get()[0].sample_state()
            self._latent._thisptr.get()[0].clear_empty_states()
            self._latent._thisptr.get()[0].sample_hypers(20)
            self._latent._thisptr.get()[0].sample_pi()
            self._latent._thisptr.get()[0].sample_phi()
示例#17
0
    def run(self, r, niters=10000):
        """Run the configured LDA kernels for `niters` iterations,
        single-threaded.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        for _ in xrange(niters):
            for kname, kconfig in self._kernel_config:
                if kname == 'crf':
                    lda_crp_gibbs(self._latent, r)
                elif kname == 'direct_base_dp_hp':
                    sample_gamma(self._latent, r,
                                 kconfig['hp1'], kconfig['hp2'])
                elif kname == 'direct_second_dp_hp':
                    sample_alpha(self._latent, r,
                                 kconfig['hp1'], kconfig['hp2'])
                elif kname == 'direct_vocab_hp':
                    raise NotImplementedError('direct_vocab_hp not yet implemented')
                else:
                    raise ValueError("Bad kernel specification {}".format(kname))
示例#18
0
文件: runner.py 项目: jzf2101/lda
    def __init__(self, defn, view, latent, kernel_config):
        """Validate inputs and record the kernel configuration.

        Parameters
        ----------
        defn : model_definition
        view : abstract_dataview
        latent : state
        kernel_config : iterable of (name, config) pairs; only the
            'assign' kernel is currently supported.

        Raises
        ------
        ValueError
            If a kernel name other than 'assign' is supplied.
        """
        validator.validate_type(defn, model_definition, 'defn')
        validator.validate_type(view, abstract_dataview, 'view')
        validator.validate_type(latent, state, 'latent')

        self._defn = defn
        self._view = view
        #self._latent = copy.deepcopy(latent)
        # XXX(stephentu): make copy work
        self._latent = latent

        self._kernel_config = []
        for kernel in kernel_config:
            name, config = kernel
            validator.validate_dict_like(config)
            # BUG FIX: the name check and append were previously dedented
            # outside the loop, so only the last kernel was validated and
            # recorded
            if name == 'assign':
                pass
            else:
                raise ValueError("bad kernel found: {}".format(name))
            self._kernel_config.append((name, config))
示例#19
0
文件: runner.py 项目: jzf2101/lda
    def __init__(self, defn, view, latent, kernel_config):
        """Validate inputs and record the kernel configuration.

        Parameters
        ----------
        defn : model_definition
        view : abstract_dataview
        latent : state
        kernel_config : iterable of (name, config) pairs; only the
            'assign' kernel is currently supported.

        Raises
        ------
        ValueError
            If a kernel name other than 'assign' is supplied.
        """
        validator.validate_type(defn, model_definition, 'defn')
        validator.validate_type(view, abstract_dataview, 'view')
        validator.validate_type(latent, state, 'latent')

        self._defn = defn
        self._view = view
        #self._latent = copy.deepcopy(latent)
        # XXX(stephentu): make copy work
        self._latent = latent

        self._kernel_config = []
        for kernel in kernel_config:
            name, config = kernel
            validator.validate_dict_like(config)
            # BUG FIX: validation and append were previously outside the
            # loop body, so every kernel except the last was silently
            # dropped and left unvalidated
            if name == 'assign':
                pass
            else:
                raise ValueError("bad kernel found: {}".format(name))
            self._kernel_config.append((name, config))
示例#20
0
    def __init__(self, defn, views, latent, kernel_config):
        """Validate the IRM definition, data views, latent state, and
        kernel configuration, normalizing each kernel's config as it goes.

        Parameters
        ----------
        defn : model_definition
        views : list of abstract_dataview, one per relation in `defn`
        latent : state
        kernel_config : iterable of (name, config) pairs
        """
        validator.validate_type(defn, model_definition, 'defn')
        validator.validate_len(views, len(defn.relations()), 'views')
        for view in views:
            validator.validate_type(view, abstract_dataview)
        validator.validate_type(latent, state, 'latent')

        self._defn = defn
        self._views = views
        # deep-copy so kernel runs don't mutate the caller's latent state
        self._latent = copy.deepcopy(latent)

        self._kernel_config = []
        for kernel in kernel_config:
            name, config = kernel

            # configs given as a bare iterable of indices are normalized to
            # {index: {}} dicts
            if not hasattr(config, 'iteritems'):
                config = {c: {} for c in config}
            validator.validate_dict_like(config)

            def require_relation_keys(config):
                # every key must be a valid relation index
                valid_keys = set(xrange(len(defn.relations())))
                if not set(config.keys()).issubset(valid_keys):
                    raise ValueError("bad config found: {}".format(config))

            def require_domain_keys(config):
                # every key must be a valid domain index
                valid_keys = set(xrange(len(defn.domains())))
                if not set(config.keys()).issubset(valid_keys):
                    raise ValueError("bad config found: {}".format(config))

            if name == 'assign':
                require_domain_keys(config)
                for v in config.values():
                    validator.validate_dict_like(v)
                    if v:
                        msg = "assign has no config params: {}".format(v)
                        raise ValueError(msg)

            elif name == 'assign_resample':
                require_domain_keys(config)
                for v in config.values():
                    validator.validate_dict_like(v)
                    if v.keys() != ['m']:
                        raise ValueError("bad config found: {}".format(v))

            elif name == 'slice_cluster_hp':
                require_domain_keys(config)
                for v in config.values():
                    validator.validate_dict_like(v)
                    if v.keys() != ['cparam']:
                        raise ValueError("bad config found: {}".format(v))

            elif name == 'grid_relation_hp':
                require_relation_keys(config)
                for ri, ps in config.iteritems():
                    if set(ps.keys()) != set(('hpdf', 'hgrid',)):
                        raise ValueError("bad config found: {}".format(ps))
                    # expand each partial grid point into a full hp dict
                    # seeded from the latent's current relation hp
                    full = []
                    for partial in ps['hgrid']:
                        hp = latent.get_relation_hp(ri)
                        hp.update(partial)
                        full.append(hp)
                    # NOTE: mutates the caller-supplied config in place
                    ps['hgrid'] = full

            elif name == 'slice_relation_hp':
                if config.keys() != ['hparams']:
                    raise ValueError("bad config found: {}".format(config))
                validator.validate_dict_like(config['hparams'])
                require_relation_keys(config['hparams'])

            elif name == 'theta':
                if config.keys() != ['tparams']:
                    raise ValueError("bad config found: {}".format(config))
                validator.validate_dict_like(config['tparams'])
                require_relation_keys(config['tparams'])

            else:
                raise ValueError("bad kernel found: {}".format(name))

            self._kernel_config.append((name, config))
示例#21
0
    def __init__(self, defn, view, latent, kernel_config):
        """Validate the mixturemodel definition, data view, latent state,
        and kernel configuration.

        Parameters
        ----------
        defn : model definition (validated/coerced by _validate_definition)
        view : abstract_dataview with one entry per entity in `defn`
        latent : state
        kernel_config : iterable of kernel names or (name, config) pairs
        """
        defn = _validate_definition(defn)
        validator.validate_type(view, abstract_dataview, param_name='view')
        if not isinstance(latent, state):
            raise ValueError("bad latent given")
        validator.validate_len(view, defn.n())

        def require_feature_indices(v):
            # every key of v must be a valid feature (model) index
            nfeatures = len(defn.models())
            valid_keys = set(xrange(nfeatures))
            if not set(v.keys()).issubset(valid_keys):
                msg = "bad config found: {}".format(v)
                raise ValueError(msg)

        self._defn = defn
        self._view = view
        # deep-copy so kernel runs don't mutate the caller's latent state
        self._latent = copy.deepcopy(latent)

        self._kernel_config = []
        for kernel in kernel_config:

            # a kernel may be a bare name or a (name, config) pair
            if hasattr(kernel, '__iter__'):
                name, config = kernel
            else:
                name, config = kernel, {}
            validator.validate_dict_like(config)

            if name == 'assign':
                if config:
                    raise ValueError("assign has no parameters")

            elif name == 'assign_resample':
                if config.keys() != ['m']:
                    raise ValueError("bad config found: {}".format(config))
                validator.validate_positive(config['m'])

            elif name == 'grid_feature_hp':
                require_feature_indices(config)
                for fi, ps in config.iteritems():
                    if set(ps.keys()) != set(('hpdf', 'hgrid',)):
                        raise ValueError("bad config found: {}".format(ps))
                    # expand each partial grid point into a full hp dict
                    # seeded from the latent's current feature hp
                    full = []
                    for partial in ps['hgrid']:
                        hp = latent.get_feature_hp(fi)
                        hp.update(partial)
                        full.append(hp)
                    # NOTE: mutates the caller-supplied config in place
                    ps['hgrid'] = full

            elif name == 'slice_feature_hp':
                if config.keys() != ['hparams']:
                    raise ValueError("bad config found: {}".format(config))
                require_feature_indices(config['hparams'])

            elif name == 'slice_cluster_hp':
                if config.keys() != ['cparam']:
                    raise ValueError("bad config found: {}".format(config))
                if config['cparam'].keys() != ['alpha']:
                    msg = "bad config found: {}".format(config['cparam'])
                    raise ValueError(msg)

            elif name == 'theta':
                if config.keys() != ['tparams']:
                    raise ValueError("bad config found: {}".format(config))
                require_feature_indices(config['tparams'])

            else:
                raise ValueError("bad kernel found: {}".format(name))

            self._kernel_config.append((name, config))
示例#22
0
def test_validate_type():
    # a value of the expected type passes silently
    value = "abc"
    V.validate_type(value, str)
    # a mismatched type must raise ValueError
    assert_raises(ValueError, V.validate_type, value, dict)
示例#23
0
def test_validate_type():
    subject = "abc"
    # accepted: matching type; rejected: mismatched type raises ValueError
    V.validate_type(subject, str)
    assert_raises(ValueError, V.validate_type, subject, dict)