def run(self, r, niters=10000):
    """Run the specified mixturemodel kernel for `niters` sweeps in the
    calling thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    doc_model = bind(self._latent, data=self._view)
    for _ in xrange(niters):
        for name, _config in self._kernel_config:
            if name == 'assign':
                # resample the document-level assignments first ...
                assign2(doc_model, r)
                # ... then every per-document table assignment
                table_models = [bind(self._latent, document=did)
                                for did in xrange(self._latent.nentities())]
                for table_model in table_models:
                    assign(table_model, r)
            else:
                assert False, 'should not be reached'
def run(self, r, niters=10000):
    """Run the configured kernels for `niters` sweeps in the calling
    thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    # one bound model per domain; kernels index into this list
    ndomains = len(self._defn.domains())
    bound = [bind(self._latent, di, self._views) for di in xrange(ndomains)]
    for _ in xrange(niters):
        for kname, params in self._kernel_config:
            if kname == 'assign':
                for di in params.keys():
                    gibbs.assign(bound[di], r)
            elif kname == 'assign_resample':
                for di, p in params.iteritems():
                    gibbs.assign_resample(bound[di], p['m'], r)
            elif kname == 'slice_cluster_hp':
                for di, p in params.iteritems():
                    slice.hp(bound[di], r, cparam=p['cparam'])
            elif kname == 'grid_relation_hp':
                # relation-level kernels operate through any bound model;
                # index 0 is used by convention
                gibbs.hp(bound[0], params, r)
            elif kname == 'slice_relation_hp':
                slice.hp(bound[0], r, hparams=params['hparams'])
            elif kname == 'theta':
                slice.theta(bound[0], r, tparams=params['tparams'])
            else:
                assert False, "should not be reached"
def run(self, r, niters=10000):
    """Run the specified mixturemodel kernel for `niters`, in a single
    thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    model = bind(self._latent, self._view)
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'assign':
                gibbs.assign(model, r)
            elif name == 'assign_resample':
                gibbs.assign_resample(model, config['m'], r)
            elif name == 'grid_feature_hp':
                gibbs.hp(model, config, r)
            elif name == 'slice_feature_hp':
                slice.hp(model, r, hparams=config['hparams'])
            elif name == 'slice_cluster_hp':
                slice.hp(model, r, cparam=config['cparam'])
            elif name == 'theta':
                slice.theta(model, r, tparams=config['tparams'])
            else:
                # fixed message typo ("reach" -> "reached") for
                # consistency with the other runners in this file
                assert False, "should not be reached"
def run(self, r, niters=10000):
    """Run the lda kernel for `niters` sweeps in the calling thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'crf':
                lda_crp_gibbs(self._latent, r)
            elif name in ('direct_base_dp_hp', 'direct_second_dp_hp'):
                # both hyper-parameter kernels share the same call shape
                if name == 'direct_base_dp_hp':
                    sampler = sample_gamma
                else:
                    sampler = sample_alpha
                sampler(self._latent, r, config['hp1'], config['hp2'])
            elif name == 'direct_vocab_hp':
                raise NotImplementedError(
                    'direct_vocab_hp not yet implemented')
            else:
                raise ValueError(
                    "Bad kernel specification {}".format(name))
def run(self, r, niters=10000):
    """Run each runner for `niters`, using the backend supplied in the
    constructor for parallelism.

    Parameters
    ----------
    r : rng
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    if self._backend == 'multiprocessing':
        pool = mp.Pool(processes=self._processes)
        # each worker draws its own seed from `r`; the trailing None is
        # the statearg slot, unused by the multiprocessing backend
        args = [(runner, niters, r.next(), None)
                for runner in self._runners]
        # map_async() + get() allows us to workaround a bug where
        # control-C doesn't kill multiprocessing workers
        self._runners = pool.map_async(_mp_work, args).get(10000000)
        pool.close()
        pool.join()
    elif self._backend == 'multyvac':
        # XXX(stephentu): the only parallelism strategy thus far is every
        # runner gets a dedicated core (multicore=1) on a machine
        jids = []
        has_volume = bool(self._volume)
        zipped = zip(self._runners, self._digests)
        # with a volume, "expensive" state is shipped via the volume
        # rather than pickled with the runner; stash it locally so it can
        # be re-attached after the remote jobs finish
        expensive_states = []
        for i, (runner, digest) in enumerate(zipped):
            if has_volume:
                statearg = (self._volume, 'state-{}'.format(digest))
                expensive_states.append(runner.expensive_state)
                runner.expensive_state = None
            else:
                statearg = None
            args = (runner, niters, r.next(), statearg)
            jids.append(
                multyvac.submit(
                    _mp_work,
                    args,
                    _ignore_module_dependencies=True,
                    _layer=self._layer,
                    _vol=self._volume,
                    _env=dict(self._env),  # submit() mutates the env
                    _core=self._core,
                    _name='kernels-parallel-runner-{}'.format(i)))
        # block until every job completes; replace the local runners with
        # the post-inference copies returned by the workers
        self._runners = [multyvac.get(jid).get_result() for jid in jids]
        if not expensive_states:
            return
        # restore the expensive state detached above, pairing runners and
        # states in submission order
        for runner, state in zip(self._runners, expensive_states):
            runner.expensive_state = state
    else:
        assert False, 'should not be reached'
def __init__(self, defn, view, latent, kernel_config): validator.validate_type(defn, model_definition, 'defn') # validator.validate_type(view, abstract_dataview, 'view') # for now, view is actually a list of lists validator.validate_type(latent, state, 'latent') self._defn = defn self._view = view self._latent = latent self._kernel_config = [] for kernel in kernel_config: if hasattr(kernel, '__iter__'): name, config = kernel else: name, config = kernel, {} validator.validate_dict_like(config) if name == 'beam': pass elif name == 'hypers': if 'alpha' in config: assert 'alpha_a' not in config and 'alpha_b' not in config alpha = config['alpha'] assert alpha > 0 latent.fix_alpha(alpha) elif 'alpha_a' in config and 'alpha_b' in config: assert 'alpha' not in config alpha_a = config['alpha_a'] alpha_b = config['alpha_b'] assert alpha_a > 0 and alpha_b > 0 latent.set_alpha_hypers(alpha_a, alpha_b) else: raise ValueError("Configuration missing parameters for alpha0") if 'gamma' in config: assert 'gamma_a' not in config and 'gamma_b' not in config gamma = config['gamma'] assert gamma > 0 latent.fix_gamma(gamma) elif 'gamma_a' in config and 'gamma_b' in config: assert 'gamma' not in config gamma_a = config['gamma_a'] gamma_b = config['gamma_b'] assert gamma_a > 0 and gamma_b > 0 latent.set_gamma_hypers(gamma_a, gamma_b) else: raise ValueError("Configuration missing parameters for gamma") else: raise ValueError("bad kernel found: {}".format(name)) self._kernel_config.append((name, config))
def run(self, r, niters=10000):
    """Run the lda kernel for `niters` sweeps in the calling thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    for _sweep in xrange(niters):
        lda_crp_gibbs(self._latent, r)
def default_assign_kernel_config(defn):
    """Creates a default kernel configuration for sampling the assignment
    (clustering) vector for every domain. The default kernel is currently
    a gibbs sampler.

    Parameters
    ----------
    defn : irm definition
    """
    validator.validate_type(defn, model_definition, 'defn')

    # XXX(stephentu): model_descriptors should implement
    # is_conjugate()
    conj_inds, nonconj_inds = [], []
    for idx, m in enumerate(defn.relation_models()):
        if m.name() == 'bbnc':
            nonconj_inds.append(idx)
        else:
            conj_inds.append(idx)

    # every domain touched by a non-conjugate relation needs the
    # resampling kernel; the rest take the plain gibbs assign kernel
    nonconj_domains = set()
    for idx in nonconj_inds:
        nonconj_domains.update(defn.relations()[idx])
    conj_domains = [did for did in xrange(len(defn.domains()))
                    if did not in nonconj_domains]
    nonconj_domains = list(nonconj_domains)

    kernels = []
    if conj_domains:
        kernels.append(('assign', conj_domains))
    if nonconj_domains:
        kernels.append(
            ('assign_resample',
             {idx: {'m': 10} for idx in nonconj_domains}))
        kernels.append(
            ('theta',
             {'tparams': {idx: {'p': 0.1} for idx in nonconj_inds}}))
    return kernels
def toy_dataset_and_states(defn, states=5, avglen=100, numobs=100):
    """Create a toy dataset for evaluating HMM inference, return the data
    as well as the latent state sequence.

    Parameters
    ----------
    defn : model definition
    states : number of latent states
    avglen : average length of one observation sequence (actual length is
        sampled from a poisson distribution)
    numobs : number of observation sequences

    Output
    ------
    data : the observations generated from the HMM
    states : the corresponding latent state sequence
    """
    validator.validate_type(defn, model_definition, 'defn')
    obs_mat, trans_mat = toy_model(defn, states)
    return gen_data(trans_mat, obs_mat, avglen, numobs)
def default_cluster_hp_kernel_config(defn):
    """Creates a default kernel configuration for sampling the clustering
    (Chinese Restaurant Process) model hyper-parameter. The default kernel
    is currently a one-dimensional slice sampler.

    Parameters
    ----------
    defn : irm definition
        The hyper-priors set in the definition are used to configure the
        hyper-parameter sampling kernels.
    """
    validator.validate_type(defn, model_definition, 'defn')
    # one slice-sampler entry per domain that declares a hyper-prior;
    # 0.1 is the (arbitrary) slice width paired with each prior fn
    config = {
        i: {'cparam': {k: (fn, 0.1) for k, fn in hp.iteritems()}}
        for i, hp in enumerate(defn.domain_hyperpriors())
        if hp
    }
    return [('slice_cluster_hp', config)] if config else []
def default_relation_hp_kernel_config(defn):
    """Creates a default kernel configuration for sampling the component
    (feature) model hyper-parameters. The default kernel is currently a
    one-dimensional slice sampler.

    Parameters
    ----------
    defn : irm definition
        The hyper-priors set in the definition are used to configure the
        hyper-parameter sampling kernels.
    """
    validator.validate_type(defn, model_definition, 'defn')
    # XXX(stephentu): we are arbitrarily picking w=0.1
    hparams = {
        i: {k: (fn, 0.1) for k, fn in hp.iteritems()}
        for i, hp in enumerate(defn.relation_hyperpriors())
        if hp
    }
    return [('slice_relation_hp', {'hparams': hparams})] if hparams else []
def run(self, r, niters=10000):
    """Run the specified kernel for `niters`, in a single thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    # NOTE(review): `r` is validated but never passed down; the C++
    # sampler presumably draws from its own internal RNG -- confirm.
    for _ in xrange(niters):
        # HACK: we reach straight through the Cython wrapper into the
        # underlying C++ state object, because the individual sampling
        # steps are not exposed on the Python-level state interface.
        # A cleaner fix would be a Cython shim or widening the state
        # interface to expose these functions separately.
        self._latent._thisptr.get()[0].sample_aux()
        self._latent._thisptr.get()[0].sample_state()
        self._latent._thisptr.get()[0].clear_empty_states()
        # 20 looks like the number of internal hyper-parameter sampling
        # iterations -- presumably a tuning constant; TODO confirm
        self._latent._thisptr.get()[0].sample_hypers(20)
        self._latent._thisptr.get()[0].sample_pi()
        self._latent._thisptr.get()[0].sample_phi()
def run(self, r, niters=10000):
    """Run the lda kernel for `niters` sweeps in the calling thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    # the two direct-assignment hyper-parameter kernels share one shape
    hp_samplers = {
        'direct_base_dp_hp': sample_gamma,
        'direct_second_dp_hp': sample_alpha,
    }
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'crf':
                lda_crp_gibbs(self._latent, r)
            elif name in hp_samplers:
                hp_samplers[name](
                    self._latent, r, config['hp1'], config['hp2'])
            elif name == 'direct_vocab_hp':
                raise NotImplementedError(
                    'direct_vocab_hp not yet implemented')
            else:
                raise ValueError(
                    "Bad kernel specification {}".format(name))
def __init__(self, defn, view, latent, kernel_config):
    """Validate inputs and record the kernel configuration; only the
    'assign' kernel is supported."""
    validator.validate_type(defn, model_definition, 'defn')
    validator.validate_type(view, abstract_dataview, 'view')
    validator.validate_type(latent, state, 'latent')
    self._defn = defn
    self._view = view
    # XXX(stephentu): make copy work -- until then we alias the caller's
    # latent instead of deep-copying it
    self._latent = latent
    self._kernel_config = []
    for kernel in kernel_config:
        name, config = kernel
        validator.validate_dict_like(config)
        if name != 'assign':
            raise ValueError("bad kernel found: {}".format(name))
        self._kernel_config.append((name, config))
def __init__(self, defn, views, latent, kernel_config):
    """Validate `kernel_config` against the irm definition and store a
    normalized list of (name, config-dict) pairs.

    Note: for 'grid_relation_hp', each config's 'hgrid' entry is expanded
    in place into full hyper-parameter dicts based on `latent`.
    """
    validator.validate_type(defn, model_definition, 'defn')
    validator.validate_len(views, len(defn.relations()), 'views')
    for view in views:
        validator.validate_type(view, abstract_dataview)
    validator.validate_type(latent, state, 'latent')
    self._defn = defn
    self._views = views
    # work on a private copy so inference never mutates the caller's state
    self._latent = copy.deepcopy(latent)
    self._kernel_config = []
    for kernel in kernel_config:
        name, config = kernel
        # allow a bare iterable of indices as shorthand for {idx: {}}
        if not hasattr(config, 'iteritems'):
            config = {c: {} for c in config}
        validator.validate_dict_like(config)

        # keys of a relation-indexed config must be valid relation indices
        def require_relation_keys(config):
            valid_keys = set(xrange(len(defn.relations())))
            if not set(config.keys()).issubset(valid_keys):
                raise ValueError("bad config found: {}".format(config))

        # keys of a domain-indexed config must be valid domain indices
        def require_domain_keys(config):
            valid_keys = set(xrange(len(defn.domains())))
            if not set(config.keys()).issubset(valid_keys):
                raise ValueError("bad config found: {}".format(config))

        if name == 'assign':
            require_domain_keys(config)
            for v in config.values():
                validator.validate_dict_like(v)
                if v:
                    msg = "assign has no config params: {}".format(v)
                    raise ValueError(msg)
        elif name == 'assign_resample':
            require_domain_keys(config)
            for v in config.values():
                validator.validate_dict_like(v)
                if v.keys() != ['m']:
                    raise ValueError("bad config found: {}".format(v))
        elif name == 'slice_cluster_hp':
            require_domain_keys(config)
            for v in config.values():
                validator.validate_dict_like(v)
                if v.keys() != ['cparam']:
                    raise ValueError("bad config found: {}".format(v))
        elif name == 'grid_relation_hp':
            require_relation_keys(config)
            for ri, ps in config.iteritems():
                if set(ps.keys()) != set(('hpdf', 'hgrid',)):
                    raise ValueError("bad config found: {}".format(ps))
                # expand each partial grid point into a full hp dict by
                # overlaying it on the latent's current relation hp
                full = []
                for partial in ps['hgrid']:
                    hp = latent.get_relation_hp(ri)
                    hp.update(partial)
                    full.append(hp)
                ps['hgrid'] = full
        elif name == 'slice_relation_hp':
            if config.keys() != ['hparams']:
                raise ValueError("bad config found: {}".format(config))
            validator.validate_dict_like(config['hparams'])
            require_relation_keys(config['hparams'])
        elif name == 'theta':
            if config.keys() != ['tparams']:
                raise ValueError("bad config found: {}".format(config))
            validator.validate_dict_like(config['tparams'])
            require_relation_keys(config['tparams'])
        else:
            raise ValueError("bad kernel found: {}".format(name))
        self._kernel_config.append((name, config))
def __init__(self, defn, view, latent, kernel_config):
    """Validate `kernel_config` against the mixturemodel definition and
    store a normalized list of (name, config-dict) pairs.

    Note: for 'grid_feature_hp', each config's 'hgrid' entry is expanded
    in place into full hyper-parameter dicts based on `latent`.
    """
    defn = _validate_definition(defn)
    validator.validate_type(view, abstract_dataview, param_name='view')
    if not isinstance(latent, state):
        raise ValueError("bad latent given")
    validator.validate_len(view, defn.n())

    # keys of a feature-indexed config must be valid feature indices
    def require_feature_indices(v):
        nfeatures = len(defn.models())
        valid_keys = set(xrange(nfeatures))
        if not set(v.keys()).issubset(valid_keys):
            msg = "bad config found: {}".format(v)
            raise ValueError(msg)

    self._defn = defn
    self._view = view
    # work on a private copy so inference never mutates the caller's state
    self._latent = copy.deepcopy(latent)
    self._kernel_config = []
    for kernel in kernel_config:
        # a kernel may be given as a bare name or a (name, config) pair
        if hasattr(kernel, '__iter__'):
            name, config = kernel
        else:
            name, config = kernel, {}
        validator.validate_dict_like(config)
        if name == 'assign':
            if config:
                raise ValueError("assign has no parameters")
        elif name == 'assign_resample':
            if config.keys() != ['m']:
                raise ValueError("bad config found: {}".format(config))
            validator.validate_positive(config['m'])
        elif name == 'grid_feature_hp':
            require_feature_indices(config)
            for fi, ps in config.iteritems():
                if set(ps.keys()) != set(('hpdf', 'hgrid',)):
                    raise ValueError("bad config found: {}".format(ps))
                # expand each partial grid point into a full hp dict by
                # overlaying it on the latent's current feature hp
                full = []
                for partial in ps['hgrid']:
                    hp = latent.get_feature_hp(fi)
                    hp.update(partial)
                    full.append(hp)
                ps['hgrid'] = full
        elif name == 'slice_feature_hp':
            if config.keys() != ['hparams']:
                raise ValueError("bad config found: {}".format(config))
            require_feature_indices(config['hparams'])
        elif name == 'slice_cluster_hp':
            if config.keys() != ['cparam']:
                raise ValueError("bad config found: {}".format(config))
            # the CRP concentration is the only supported cluster hp
            if config['cparam'].keys() != ['alpha']:
                msg = "bad config found: {}".format(config['cparam'])
                raise ValueError(msg)
        elif name == 'theta':
            if config.keys() != ['tparams']:
                raise ValueError("bad config found: {}".format(config))
            require_feature_indices(config['tparams'])
        else:
            raise ValueError("bad kernel found: {}".format(name))
        self._kernel_config.append((name, config))
def test_validate_type():
    value = "abc"
    # a str instance passes a str type check ...
    V.validate_type(value, str)
    # ... and fails a dict type check
    assert_raises(ValueError, V.validate_type, value, dict)