def initialize(self, **model_params):
    """
    `model_params` are parameters used to initialize individual workers (gets
    handed all the way down to `worker.initialize()`).
    """
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()
    self._jobsdone = 0
    self._jobsreceived = 0

    # locate all available workers and store their proxies, for subsequent RMI calls
    self.workers = {}
    import Pyro4
    with utils.getNS() as ns:
        self.callback = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')  # = self
        # make sure workers transfer control back to dispatcher asynchronously
        self.callback._pyroOneway.add("jobdone")
        # `.items()` instead of the Py2-only `.iteritems()`: iterates the same
        # pairs and works on both Python 2 and 3
        for name, uri in ns.list(prefix='gensim.lda_worker').items():
            try:
                worker = Pyro4.Proxy(uri)
                workerid = len(self.workers)
                # make time consuming methods work asynchronously
                worker._pyroOneway.add("requestjob")
                worker._pyroOneway.add("exit")
                # lazy %-args: message only rendered if the record is emitted
                logger.info("registering worker #%i at %s", workerid, uri)
                worker.initialize(workerid, dispatcher=self.callback, **model_params)
                self.workers[workerid] = worker
            except Pyro4.errors.PyroError:
                logger.warning("unresponsive worker at %s, deleting it from the name server", uri)
                ns.remove(name)

    if not self.workers:
        raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!')
def initialize(self, **model_params):
    """
    `model_params` are parameters used to initialize individual workers (gets
    handed all the way down to `worker.initialize()`).
    """
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()
    self._jobsdone = 0
    self._jobsreceived = 0

    # locate all available workers and store their proxies, for subsequent RMI calls
    self.workers = {}
    with utils.getNS() as ns:
        import Pyro4
        self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')  # = self
        # make sure workers transfer control back to dispatcher asynchronously
        self.callback._pyroOneway.add("jobdone")
        # `.items()` instead of the Py2-only `.iteritems()`: works on Python 2 and 3
        for name, uri in ns.list(prefix='gensim.lsi_worker').items():
            try:
                worker = Pyro4.Proxy(uri)
                workerid = len(self.workers)
                # make time consuming methods work asynchronously
                worker._pyroOneway.add("requestjob")
                worker._pyroOneway.add("exit")
                logger.info("registering worker #%i from %s", workerid, uri)
                worker.initialize(workerid, dispatcher=self.callback, **model_params)
                self.workers[workerid] = worker
                worker.requestjob()
            except Pyro4.errors.PyroError:
                # was `except Pyro4.errors.PyroError, err:` -- Python-2-only
                # syntax (and `err` was unused); logger.exception() already
                # records the active traceback
                logger.exception("unresponsive worker at %s, deleting it from the name server", uri)
                ns.remove(name)

    # fail fast instead of silently idling with an empty worker pool,
    # consistent with the other dispatcher implementations in this file
    if not self.workers:
        raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!')
def initialize(self, **model_params):
    """
    `model_params` are parameters used to initialize individual workers (gets
    handed all the way down to `worker.initialize()`).
    """
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()
    self._jobsdone = 0
    self._jobsreceived = 0

    # discover every worker registered on the name server and keep a proxy
    # to each one around, for subsequent remote method invocations
    self.workers = {}
    with utils.getNS(**self.ns_conf) as ns:
        self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
        for ns_name, ns_uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)):
            try:
                proxy = Pyro4.Proxy(ns_uri)
                worker_id = len(self.workers)  # make time consuming methods work asynchronously
                logger.info("registering worker #%i at %s", worker_id, ns_uri)
                proxy.initialize(worker_id, dispatcher=self.callback, **model_params)
                self.workers[worker_id] = proxy
            except Pyro4.errors.PyroError:
                # unreachable worker: drop its stale registration
                logger.warning("unresponsive worker at %s, deleting it from the name server", ns_uri)
                ns.remove(ns_name)

    if not self.workers:
        raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!')
def initialize(self, **model_params):
    """
    `model_params` are parameters used to initialize individual workers (gets
    handed all the way down to `worker.initialize()`).
    """
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()
    self._jobsdone = 0
    self._jobsreceived = 0

    # locate all available workers and store their proxies, for subsequent RMI calls
    self.workers = {}
    with utils.getNS(**self.ns_conf) as ns:
        self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
        for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)):
            try:
                worker = Pyro4.Proxy(uri)
                workerid = len(self.workers)
                # make time consuming methods work asynchronously
                # lazy %-args (instead of eager "..." % (...)): the message is
                # only rendered when the record is actually emitted
                logger.info("registering worker #%i at %s", workerid, uri)
                worker.initialize(workerid, dispatcher=self.callback, **model_params)
                self.workers[workerid] = worker
            except Pyro4.errors.PyroError:
                logger.warning("unresponsive worker at %s, deleting it from the name server", uri)
                ns.remove(name)

    if not self.workers:
        raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!')
def initialize(self, **model_params):
    """Fully initialize the dispatcher and all its workers.

    Parameters
    ----------
    **model_params
        Keyword parameters used to initialize individual workers (gets handed all the way down to
        :meth:`gensim.models.lsi_worker.Worker.initialize`). See :class:`~gensim.models.lsimodel.LsiModel`.

    Raises
    ------
    RuntimeError
        When no workers are found (the :mod:`gensim.model.lsi_worker` script must be ran beforehand).

    """
    self.workers = {}
    self._jobsdone = 0
    self._jobsreceived = 0
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()

    # query the name server for registered workers and hold a proxy to each,
    # for subsequent remote method invocations
    with utils.getNS() as ns:
        self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')  # = self
        for reg_name, reg_uri in iteritems(ns.list(prefix='gensim.lsi_worker')):
            try:
                remote = Pyro4.Proxy(reg_uri)
                next_id = len(self.workers)
                # make time consuming methods work asynchronously
                logger.info("registering worker #%i from %s", next_id, reg_uri)
                remote.initialize(next_id, dispatcher=self.callback, **model_params)
            except Pyro4.errors.PyroError:
                logger.exception("unresponsive worker at %s, deleting it from the name server", reg_uri)
                ns.remove(reg_name)
            else:
                self.workers[next_id] = remote

    if not self.workers:
        raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!')
def initialize(self, **model_params):
    """Fully initializes the dispatcher and all its workers.

    Parameters
    ----------
    **model_params
        Keyword parameters used to initialize individual workers, see :class:`~gensim.models.ldamodel.LdaModel`.

    Raises
    ------
    RuntimeError
        When no workers are found (the :mod:`gensim.models.lda_worker` script must be ran beforehand).

    """
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()
    self._jobsdone = 0
    self._jobsreceived = 0

    # locate all available workers and store their proxies, for subsequent RMI calls
    self.workers = {}
    with utils.getNS(**self.ns_conf) as ns:
        self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
        for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)):
            try:
                worker = Pyro4.Proxy(uri)
                workerid = len(self.workers)
                # make time consuming methods work asynchronously
                logger.info("registering worker #%i at %s", workerid, uri)
                worker.initialize(workerid, dispatcher=self.callback, **model_params)
                self.workers[workerid] = worker
            except Pyro4.errors.PyroError:
                # message previously read "...%s,deleting..." -- missing space after the comma
                logger.warning("unresponsive worker at %s, deleting it from the name server", uri)
                ns.remove(name)

    if not self.workers:
        raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!')
def initialize(self, **model_params):
    """Fully initialize the dispatcher and all its workers.

    Parameters
    ----------
    **model_params
        Keyword parameters used to initialize individual workers (gets handed all the way down to
        :meth:`gensim.models.lsi_worker.Worker.initialize`). See :class:`~gensim.models.lsimodel.LsiModel`.

    Raises
    ------
    RuntimeError
        When no workers are found (the :mod:`gensim.model.lsi_worker` script must be ran beforehand).

    """
    self.lock_update = threading.Lock()
    self.jobs = Queue(maxsize=self.maxsize)
    self._jobsreceived = 0
    self._jobsdone = 0

    # enumerate the workers known to the name server; keep a live proxy to
    # each responsive one for later remote method invocations
    self.workers = {}
    with utils.getNS() as ns:
        self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')  # = self
        registered = ns.list(prefix='gensim.lsi_worker')
        for entry_name, entry_uri in iteritems(registered):
            try:
                candidate = Pyro4.Proxy(entry_uri)
                candidate_id = len(self.workers)
                # make time consuming methods work asynchronously
                logger.info("registering worker #%i from %s", candidate_id, entry_uri)
                candidate.initialize(candidate_id, dispatcher=self.callback, **model_params)
            except Pyro4.errors.PyroError:
                logger.exception("unresponsive worker at %s, deleting it from the name server", entry_uri)
                ns.remove(entry_name)
                continue
            self.workers[candidate_id] = candidate

    if not self.workers:
        raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!')
def initialize(self, **model_params):
    """Fully initialize the dispatcher and all its workers.

    Parameters
    ----------
    **model_params
        Keyword parameters used to initialize individual workers, see :class:`~gensim.models.ldamodel.LdaModel`.

    Raises
    ------
    RuntimeError
        When no workers are found (the :mod:`gensim.models.lda_worker` script must be ran beforehand).

    """
    self.jobs = Queue(maxsize=self.maxsize)
    self.lock_update = threading.Lock()
    self._jobsdone = 0
    self._jobsreceived = 0

    # locate all available workers and store their proxies, for subsequent RMI calls
    self.workers = {}
    with utils.getNS(**self.ns_conf) as ns:
        self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
        for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)):
            try:
                worker = Pyro4.Proxy(uri)
                workerid = len(self.workers)
                # make time consuming methods work asynchronously
                logger.info("registering worker #%i at %s", workerid, uri)
                worker.initialize(workerid, dispatcher=self.callback, **model_params)
                self.workers[workerid] = worker
            except Pyro4.errors.PyroError:
                # message previously read "...%s,deleting..." -- missing space after the comma
                logger.warning("unresponsive worker at %s, deleting it from the name server", uri)
                ns.remove(name)

    if not self.workers:
        raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!')
def __init__(self, corpus=None, num_topics=100, id2word=None,
             distributed=False, chunksize=2000, passes=1, update_every=1,
             alpha='symmetric', eta=None, decay=0.5, offset=1.0,
             eval_every=10, iterations=50, gamma_threshold=0.001,
             minimum_probability=0.01, random_state=None, ns_conf={}):
    # NOTE(review): `ns_conf={}` is a mutable default argument; it is only read
    # (passed to `utils.getNS(**ns_conf)`), never mutated, so it is benign here,
    # but a `None` sentinel would be the safer idiom -- confirm before changing.
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    support special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word distributions,
    or a matrix of shape num_topics x num_words, which can be used to impose
    asymmetric priors over the word distribution on a per-topic basis. This may
    be useful if you want to seed certain topics with particular words by boosting
    the priors for those words. It also supports the special value 'auto', which
    learns an asymmetric prior directly from your data.

    Turn on `distributed` to force distributed computing (see the `web tutorial
    <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up a
    cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `random_state` can be a numpy.random.RandomState object or the seed for one

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow]) # get topic probability distribution for a document
    >>> lda.update(corpus2) # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # store user-supplied parameters
    self.id2word = id2word
    # need at least one of corpus/id2word to know the vocabulary size
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        # no explicit mapping: scan the corpus once and use word ids as-is
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        # vocabulary size = highest word id + 1 (ids need not be contiguous)
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError(
            "cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    # resolve 'symmetric'/'asymmetric'/'auto'/array into a concrete prior;
    # the second return value flags whether the prior is learned during training
    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (
        self.num_topics,
    ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
        self.alpha.shape), self.num_topics)

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = get_random_state(random_state)

    # eta may be per-topic (column vector) or full per-topic/per-word matrix
    assert (
        self.eta.shape == (self.num_topics, 1)
        or self.eta.shape == (self.num_topics, self.num_terms)
    ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_topics, self.num_topics, self.num_terms))

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError(
                "auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            with utils.getNS(**ns_conf) as ns:
                from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                # look up the dispatcher registered on the Pyro name server
                self.dispatcher = Pyro4.Proxy(
                    ns.list(prefix=LDA_DISPATCHER_PREFIX)
                    [LDA_DISPATCHER_PREFIX])
                logger.debug("looking for dispatcher at %s" %
                             str(self.dispatcher._pyroUri))
                self.dispatcher.initialize(id2word=self.id2word,
                                           num_topics=self.num_topics,
                                           chunksize=chunksize,
                                           alpha=alpha,
                                           eta=eta,
                                           distributed=False)
                self.numworkers = len(self.dispatcher.getworkers())
                logger.info("using distributed version with %i workers" %
                            self.numworkers)
        except Exception as err:
            # any failure (no name server, no dispatcher, ...) aborts construction
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError(
                "failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    # random positive initialization of the sufficient statistics
    self.state.sstats = self.random_state.gamma(
        100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
def __init__(self, corpus=None, num_topics=100, id2word=None,
             distributed=False, chunksize=2000, passes=1, update_every=1,
             alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
             iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
             random_state=None, ns_conf=None):
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    support special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word distributions,
    or a matrix of shape num_topics x num_words, which can be used to impose
    asymmetric priors over the word distribution on a per-topic basis. This may
    be useful if you want to seed certain topics with particular words by boosting
    the priors for those words. It also supports the special value 'auto', which
    learns an asymmetric prior directly from your data.

    Turn on `distributed` to force distributed computing (see the `web tutorial
    <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up a
    cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `random_state` can be a numpy.random.RandomState object or the seed for one

    `ns_conf` is an optional dict of keyword arguments for the Pyro name server
    lookup (only used when `distributed` is set).

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow]) # get topic probability distribution for a document
    >>> lda.update(corpus2) # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # `ns_conf={}` was a mutable default argument (shared across all calls);
    # use a `None` sentinel and build a fresh dict per call instead.
    # Backward compatible: passing nothing or an explicit dict behaves as before.
    if ns_conf is None:
        ns_conf = {}

    # store user-supplied parameters
    self.id2word = id2word
    # need at least one of corpus/id2word to know the vocabulary size
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        # no explicit mapping: scan the corpus once and use word ids as-is
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        # vocabulary size = highest word id + 1 (ids need not be contiguous)
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    # resolve 'symmetric'/'asymmetric'/'auto'/array into a concrete prior;
    # the second return value flags whether the prior is learned during training
    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = get_random_state(random_state)

    # eta may be per-topic (column vector) or full per-topic/per-word matrix
    assert (self.eta.shape == (self.num_topics, 1) or self.eta.shape == (self.num_topics, self.num_terms)), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_topics, self.num_topics, self.num_terms))

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            with utils.getNS(**ns_conf) as ns:
                from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                # look up the dispatcher registered on the Pyro name server
                self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                # lazy %-args instead of eager string interpolation in log calls
                logger.debug("looking for dispatcher at %s", str(self.dispatcher._pyroUri))
                self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics,
                                           chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
                self.numworkers = len(self.dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            # any failure (no name server, no dispatcher, ...) aborts construction
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    # random positive initialization of the sufficient statistics
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)