def run(self, r, niters=10000):
    """Run the configured IRM kernels for `niters` sweeps, single-threaded.

    Parameters
    ----------
    r : random state
    niters : int
        Number of full sweeps over the kernel configuration.
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    # One bound model per domain; relation-level kernels operate on the
    # first bound model.
    domain_ids = xrange(len(self._defn.domains()))
    models = [bind(self._latent, d, self._views) for d in domain_ids]
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'assign':
                for idx in config:
                    gibbs.assign(models[idx], r)
            elif name == 'assign_resample':
                for idx, params in config.iteritems():
                    gibbs.assign_resample(models[idx], params['m'], r)
            elif name == 'slice_cluster_hp':
                for idx, params in config.iteritems():
                    slice.hp(models[idx], r, cparam=params['cparam'])
            elif name == 'grid_relation_hp':
                gibbs.hp(models[0], config, r)
            elif name == 'slice_relation_hp':
                slice.hp(models[0], r, hparams=config['hparams'])
            elif name == 'theta':
                slice.theta(models[0], r, tparams=config['tparams'])
            else:
                assert False, "should not be reached"
def run(self, r, niters=10000):
    """Run the LDA kernels for `niters` sweeps, in a single thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            # Guard-style dispatch: each recognized kernel handles its
            # sweep and moves on to the next entry.
            if name == 'crf':
                lda_crp_gibbs(self._latent, r)
                continue
            if name == 'direct_base_dp_hp':
                sample_gamma(self._latent, r, config['hp1'], config['hp2'])
                continue
            if name == 'direct_second_dp_hp':
                sample_alpha(self._latent, r, config['hp1'], config['hp2'])
                continue
            if name == 'direct_vocab_hp':
                raise NotImplementedError(
                    'direct_vocab_hp not yet implemented')
            raise ValueError(
                "Bad kernel specification {}".format(name))
def run(self, r, niters=10000):
    """Run the specified kernel for `niters`, in a single thread.

    Each sweep resamples the document-level assignments and then the
    per-document table assignments.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    doc_model = bind(self._latent, data=self._view)
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'assign':
                assign2(doc_model, r)
                # FIX: renamed misspelled local `tabel_models`.
                # Rebound every sweep — presumably because the latent's
                # entity layout can change between sweeps; confirm against
                # `bind`'s semantics.
                table_models = [
                    bind(self._latent, document=did)
                    for did in xrange(self._latent.nentities())
                ]
                for table_model in table_models:
                    assign(table_model, r)
            else:
                assert False, 'should not be reached'
def run(self, r, niters=10000):
    """Run the specified mixturemodel kernel for `niters`, in a single
    thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    model = bind(self._latent, self._view)
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'assign':
                gibbs.assign(model, r)
            elif name == 'assign_resample':
                gibbs.assign_resample(model, config['m'], r)
            elif name == 'grid_feature_hp':
                gibbs.hp(model, config, r)
            elif name == 'slice_feature_hp':
                slice.hp(model, r, hparams=config['hparams'])
            elif name == 'slice_cluster_hp':
                slice.hp(model, r, cparam=config['cparam'])
            elif name == 'theta':
                slice.theta(model, r, tparams=config['tparams'])
            else:
                # FIX: message typo ("should not be reach"), now consistent
                # with the sibling runners.
                assert False, "should not be reached"
def run(self, r, niters=10000):
    """Run each runner for `niters`, using the backend supplied in the
    constructor for parallelism.

    Parameters
    ----------
    r : rng
        Source of per-runner seeds (via ``r.next()``).
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    if self._backend == 'multiprocessing':
        pool = mp.Pool(processes=self._processes)
        # each worker gets (runner, niters, seed, no-statearg)
        args = [(runner, niters, r.next(), None)
                for runner in self._runners]
        # map_async() + get() allows us to workaround a bug where
        # control-C doesn't kill multiprocessing workers
        self._runners = pool.map_async(_mp_work, args).get(10000000)
        pool.close()
        pool.join()
    elif self._backend == 'multyvac':
        # XXX(stephentu): the only parallelism strategy thus far is every
        # runner gets a dedicated core (multicore=1) on a machine
        jids = []
        has_volume = bool(self._volume)
        zipped = zip(self._runners, self._digests)
        expensive_states = []
        for i, (runner, digest) in enumerate(zipped):
            if has_volume:
                # state already uploaded to the volume at construction
                # time; strip it from the pickled payload and remember it
                # locally so it can be restored after the jobs return
                statearg = (self._volume, 'state-{}'.format(digest))
                expensive_states.append(runner.expensive_state)
                runner.expensive_state = None
            else:
                statearg = None
            args = (runner, niters, r.next(), statearg)
            jids.append(
                multyvac.submit(
                    _mp_work,
                    args,
                    _ignore_module_dependencies=True,
                    _layer=self._layer,
                    _vol=self._volume,
                    _env=dict(self._env),  # submit() mutates the env
                    _core=self._core,
                    _name='kernels-parallel-runner-{}'.format(i)))
        # blocks until all jobs finish
        self._runners = [multyvac.get(jid).get_result() for jid in jids]
        if not expensive_states:
            return
        # re-attach the stripped expensive state to the returned runners
        for runner, state in zip(self._runners, expensive_states):
            runner.expensive_state = state
    else:
        assert False, 'should not be reached'
def run(self, r, niters=10000):
    """Run the LDA CRP Gibbs kernel for `niters` sweeps, single-threaded.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    for _sweep in xrange(niters):
        lda_crp_gibbs(self._latent, r)
def posterior_predictive(q, latents, r, samples_per_chain=1):
    """Generate a bag of samples from the posterior distribution of each
    mixturemodel state object.

    Parameters
    ----------
    q : (N,) masked recarray
        The query object
    latents : list of mixturemodel latent objects
    r : random state
    samples_per_chain : int, optional
        Default is 1.

    Returns
    -------
    samples : (N, M) recarray where ``M = len(latents) * samples_per_chain``

    Notes
    -----
    If ``N=1``, the resulting `samples` will *not* be collapsed into a
    (M,) shape recarray for consistency purposes.
    """
    # FIX: docstring typos ("resultng", "collasped"); inner helper no
    # longer shadows the outer `q`.
    if len(q.shape) != 1:
        raise ValueError("1d masked recarrays only")
    if not len(latents):
        raise ValueError("no latents given")
    validator.validate_positive(
        samples_per_chain, param_name='samples_per_chain')

    def f(qi):
        # sample_post_pred() returns a pair; index [1] keeps the sample
        samples = []
        for latent in latents:
            for _ in xrange(samples_per_chain):
                samples.append(latent.sample_post_pred(qi, r)[1])
        return np.hstack(samples)

    return np.array(map(f, q))
def run(self, r, niters=10000):
    """Run the specified kernel for `niters`, in a single thread.

    Each sweep calls the underlying C++ state object's sampling steps
    directly, in a fixed order.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    # NOTE(review): `r` is validated but never passed to the sampler below —
    # presumably the C++ object manages its own RNG; confirm upstream.
    for _ in xrange(niters):
        # This goes against every object-oriented bone in my body, but the
        # interface must be satisfied. And actually Python won't even let me
        # do this because I'm accessing a method in a C++ class... I'd have
        # to write this whole thing in Cython or change the state interface
        # to expose all these functions separately... which might actually
        # be worth doing.
        self._latent._thisptr.get()[0].sample_aux()
        self._latent._thisptr.get()[0].sample_state()
        self._latent._thisptr.get()[0].clear_empty_states()
        self._latent._thisptr.get()[0].sample_hypers(20)  # 20 — meaning not visible here; TODO confirm
        self._latent._thisptr.get()[0].sample_pi()
        self._latent._thisptr.get()[0].sample_phi()
def run(self, r, niters=10000):
    """Run the LDA kernels for `niters` sweeps, in a single thread.

    Parameters
    ----------
    r : random state
    niters : int
    """
    validator.validate_type(r, rng, param_name='r')
    validator.validate_positive(niters, param_name='niters')
    for _ in xrange(niters):
        for name, config in self._kernel_config:
            if name == 'crf':
                lda_crp_gibbs(self._latent, r)
            elif name in ('direct_base_dp_hp', 'direct_second_dp_hp'):
                # both hyperparameter kernels share the (hp1, hp2) signature
                if name == 'direct_base_dp_hp':
                    sampler = sample_gamma
                else:
                    sampler = sample_alpha
                sampler(self._latent, r, config['hp1'], config['hp2'])
            elif name == 'direct_vocab_hp':
                raise NotImplementedError(
                    'direct_vocab_hp not yet implemented')
            else:
                raise ValueError(
                    "Bad kernel specification {}".format(name))
def __init__(self, defn, view, latent, kernel_config):
    """Validate and store a kernel configuration for a mixturemodel runner.

    The latent is deep-copied so the runner owns its own state; each
    kernel entry is normalized to a ``(name, config)`` pair and its
    config is validated (and, for ``grid_feature_hp``, expanded into a
    full hyperparameter grid).
    """
    defn = _validate_definition(defn)
    validator.validate_type(view, abstract_dataview, param_name='view')
    if not isinstance(latent, state):
        raise ValueError("bad latent given")
    validator.validate_len(view, defn.n())

    def check_feature_keys(mapping):
        # every key must be a valid feature index
        nfeatures = len(defn.models())
        valid_keys = set(xrange(nfeatures))
        if not set(mapping.keys()).issubset(valid_keys):
            msg = "bad config found: {}".format(mapping)
            raise ValueError(msg)

    def expect_only(cfg, key):
        # the config dict must contain exactly this one key
        if cfg.keys() != [key]:
            raise ValueError("bad config found: {}".format(cfg))

    self._defn = defn
    self._view = view
    self._latent = copy.deepcopy(latent)
    self._kernel_config = []
    for kernel in kernel_config:
        # a bare name means "no parameters"
        if hasattr(kernel, '__iter__'):
            name, config = kernel
        else:
            name, config = kernel, {}
        validator.validate_dict_like(config)
        if name == 'assign':
            if config:
                raise ValueError("assign has no parameters")
        elif name == 'assign_resample':
            expect_only(config, 'm')
            validator.validate_positive(config['m'])
        elif name == 'grid_feature_hp':
            check_feature_keys(config)
            for fi, spec in config.iteritems():
                if set(spec.keys()) != {'hpdf', 'hgrid'}:
                    raise ValueError("bad config found: {}".format(spec))
                # expand each partial grid point into a complete hp dict,
                # starting from the latent's current feature hp
                full = []
                for partial in spec['hgrid']:
                    hp = latent.get_feature_hp(fi)
                    hp.update(partial)
                    full.append(hp)
                spec['hgrid'] = full
        elif name == 'slice_feature_hp':
            expect_only(config, 'hparams')
            check_feature_keys(config['hparams'])
        elif name == 'slice_cluster_hp':
            expect_only(config, 'cparam')
            expect_only(config['cparam'], 'alpha')
        elif name == 'theta':
            expect_only(config, 'tparams')
            check_feature_keys(config['tparams'])
        else:
            raise ValueError("bad kernel found: {}".format(name))
        self._kernel_config.append((name, config))
def __init__(self, runners, backend='multiprocessing', **kwargs):
    """Set up a parallel runner over `runners`.

    Parameters
    ----------
    runners : list of runner objects
    backend : {'multiprocessing', 'multyvac'}
    **kwargs :
        multiprocessing: ``processes`` (default ``mp.cpu_count()``).
        multyvac: ``layer`` (required), ``core`` (default ``'f2'``),
        ``volume`` (strongly recommended).

    Raises
    ------
    ValueError
        On an unknown backend, missing multyvac module/layer/auth, or a
        nonexistent volume.
    """
    self._runners = runners
    if backend not in ('multiprocessing', 'multyvac',):
        raise ValueError("invalid backend: {}".format(backend))
    self._backend = backend
    if backend == 'multiprocessing':
        validator.validate_kwargs(kwargs, ('processes',))
        if 'processes' not in kwargs:
            kwargs['processes'] = mp.cpu_count()
        validator.validate_positive(kwargs['processes'], 'processes')
        self._processes = kwargs['processes']
    elif backend == 'multyvac':
        if not _has_multyvac:
            raise ValueError("multyvac module not installed on machine")
        validator.validate_kwargs(kwargs, ('layer', 'core', 'volume',))
        if 'layer' not in kwargs:
            # FIX: the adjacent string literals previously concatenated
            # without a separator ("...a layer.see scripts in bin")
            msg = ('multyvac support requires setting up a layer. '
                   'see scripts in bin')
            raise ValueError(msg)
        self._volume = kwargs.get('volume', None)
        if self._volume is None:
            msg = "use of a volume is highly recommended"
            warnings.warn(msg)
        else:
            volume = multyvac.volume.get(self._volume)
            if not volume:
                raise ValueError(
                    "no such volume: {}".format(self._volume))
        self._layer = kwargs['layer']
        if (not multyvac.config.api_key or
                not multyvac.config.api_secret_key):
            raise ValueError("multyvac is not auth-ed")
        # XXX(stephentu): currently defaults to the good stuff
        self._core = kwargs.get('core', 'f2')
        self._env = {}
        # XXX(stephentu): assumes you used the setup multyvac scripts we
        # provide
        self._env['PATH'] = '{}:{}'.format(
            '/home/multyvac/miniconda/envs/build/bin', _MULTYVAC_PATH)
        self._env['CONDA_DEFAULT_ENV'] = 'build'
        # this is needed for multyvacinit.pybootstrap
        self._env['PYTHONPATH'] = '/usr/local/lib/python2.7/dist-packages'
        # XXX(stephentu): multyvac post requests are limited in size
        # (don't know what the hard limit is). so to avoid the limits, we
        # explicitly serialize the expensive state to a file
        if not self._volume:
            # no volume provided for uploads
            self._digests = [None for _ in xrange(len(self._runners))]
            return
        # XXX(stephentu): we shouldn't reach in there like this
        self._digests = []
        digest_cache = {}  # keyed by id() so shared states hash once
        for runner in self._runners:
            cache_key = id(runner.expensive_state)
            if cache_key in digest_cache:
                digest = digest_cache[cache_key]
            else:
                h = hashlib.sha1()
                runner.expensive_state_digest(h)
                digest = h.hexdigest()
                digest_cache[cache_key] = digest
            self._digests.append(digest)
        uploaded = set(_mvac_list_files_in_dir(volume, ""))
        _logger.info("starting state uploads")
        start = time.time()
        for runner, digest in zip(self._runners, self._digests):
            if digest in uploaded:
                continue
            # FIX: log message said "uploaded" before the upload ran
            _logger.info("uploading state-%s since not found", digest)
            f = tempfile.NamedTemporaryFile()
            pickle.dump(runner.expensive_state, f)
            f.flush()
            # XXX(stephentu) this seems to fail for large files
            # volume.put_file(f.name, 'state-{}'.format(digest))
            volume.sync_up(f.name, 'state-{}'.format(digest))
            f.close()
            uploaded.add(digest)
        _logger.info("state upload took %f seconds", (time.time() - start))
    else:
        assert False, 'should not be reached'
def __init__(self, runners, backend='multiprocessing', **kwargs):
    """Set up a parallel runner over `runners`.

    Parameters
    ----------
    runners : list of runner objects
    backend : {'multiprocessing', 'multyvac'}
    **kwargs :
        multiprocessing: ``processes`` (default ``mp.cpu_count()``).
        multyvac: ``layer`` (required), ``core`` (default ``'f2'``),
        ``volume`` (strongly recommended).

    Raises
    ------
    ValueError
        On an unknown backend, missing multyvac module/layer/auth, or a
        nonexistent volume.
    """
    self._runners = runners
    if backend not in ('multiprocessing', 'multyvac',):
        raise ValueError("invalid backend: {}".format(backend))
    self._backend = backend
    if backend == 'multiprocessing':
        validator.validate_kwargs(kwargs, ('processes',))
        if 'processes' not in kwargs:
            kwargs['processes'] = mp.cpu_count()
        validator.validate_positive(kwargs['processes'], 'processes')
        self._processes = kwargs['processes']
    elif backend == 'multyvac':
        if not _has_multyvac:
            raise ValueError("multyvac module not installed on machine")
        validator.validate_kwargs(kwargs, ('layer', 'core', 'volume',))
        if 'layer' not in kwargs:
            # FIX: adjacent string literals used to concatenate without a
            # separator, yielding "...a layer.see scripts in bin"
            msg = ('multyvac support requires setting up a layer. '
                   'see scripts in bin')
            raise ValueError(msg)
        self._volume = kwargs.get('volume', None)
        if self._volume is None:
            msg = "use of a volume is highly recommended"
            warnings.warn(msg)
        else:
            volume = multyvac.volume.get(self._volume)
            if not volume:
                raise ValueError(
                    "no such volume: {}".format(self._volume))
        self._layer = kwargs['layer']
        if (not multyvac.config.api_key or
                not multyvac.config.api_secret_key):
            raise ValueError("multyvac is not auth-ed")
        # XXX(stephentu): currently defaults to the good stuff
        self._core = kwargs.get('core', 'f2')
        self._env = {}
        # XXX(stephentu): assumes you used the setup multyvac scripts we
        # provide
        self._env['PATH'] = '{}:{}'.format(
            '/home/multyvac/miniconda/envs/build/bin', _MULTYVAC_PATH)
        self._env['CONDA_DEFAULT_ENV'] = 'build'
        # this is needed for multyvacinit.pybootstrap
        self._env['PYTHONPATH'] = '/usr/local/lib/python2.7/dist-packages'
        # XXX(stephentu): multyvac post requests are limited in size
        # (don't know what the hard limit is). so to avoid the limits, we
        # explicitly serialize the expensive state to a file
        if not self._volume:
            # no volume provided for uploads
            self._digests = [None for _ in xrange(len(self._runners))]
            return
        # XXX(stephentu): we shouldn't reach in there like this
        self._digests = []
        digest_cache = {}  # keyed by id() so shared states hash once
        for runner in self._runners:
            cache_key = id(runner.expensive_state)
            if cache_key in digest_cache:
                digest = digest_cache[cache_key]
            else:
                h = hashlib.sha1()
                runner.expensive_state_digest(h)
                digest = h.hexdigest()
                digest_cache[cache_key] = digest
            self._digests.append(digest)
        uploaded = set(_mvac_list_files_in_dir(volume, ""))
        _logger.info("starting state uploads")
        start = time.time()
        for runner, digest in zip(self._runners, self._digests):
            if digest in uploaded:
                continue
            # FIX: log message said "uploaded" before the upload ran
            _logger.info("uploading state-%s since not found", digest)
            f = tempfile.NamedTemporaryFile()
            pickle.dump(runner.expensive_state, f)
            f.flush()
            # XXX(stephentu) this seems to fail for large files
            # volume.put_file(f.name, 'state-{}'.format(digest))
            volume.sync_up(f.name, 'state-{}'.format(digest))
            f.close()
            uploaded.add(digest)
        _logger.info("state upload took %f seconds", (time.time() - start))
    else:
        assert False, 'should not be reached'