def __init__(self, source_lang_vec, target_lang_vec, tagged_docs=None, random_state=None):
    """
    Parameters
    ----------
    source_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec`
        Source Doc2Vec model.
    target_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec`
        Target Doc2Vec model.
    tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
        Documents that will be used for training; both the source language document vectors
        and the target language document vectors are trained on these tagged documents.
    random_state : {None, int, array_like}, optional
        Seed for random state.

    """
    self.tagged_docs = tagged_docs
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
    if tagged_docs is not None:
        self.train(tagged_docs)
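# --- Hedged usage sketch for the back-mapping constructor above -----------------------
# Illustrative only: it follows the argument order shown in the snippet above; the tiny
# TaggedDocument corpus, dimensions and seeds are invented, and the Doc2Vec keyword names
# assume a recent gensim release (`vector_size` is called `size` in older versions).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.translation_matrix import BackMappingTranslationMatrix

docs = [
    TaggedDocument(words=["human", "interface", "computer"], tags=[0]),
    TaggedDocument(words=["graph", "trees", "system"], tags=[1]),
]
src_d2v = Doc2Vec(docs, vector_size=8, min_count=1, epochs=5, seed=1)
tgt_d2v = Doc2Vec(docs, vector_size=8, min_count=1, epochs=5, seed=2)

# Passing tagged_docs triggers train() inside __init__, exactly as in the constructor above.
back_mapping = BackMappingTranslationMatrix(src_d2v, tgt_d2v, tagged_docs=docs, random_state=0)
print(back_mapping.translation_matrix.shape)  # square matrix, one row/column per vector dimension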
def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None):
    """
    Initialize the model from a list of `word_pairs`. Each word pair is a tuple of
    (source language word, target language word).
    Example: [("one", "uno"), ("two", "due")]

    Args:
        `word_pairs` (list): a list of word pairs
        `source_lang_vec` (KeyedVectors): word vectors of the source language
        `target_lang_vec` (KeyedVectors): word vectors of the target language
    """
    self.source_word = None
    self.target_word = None
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
    self.source_space = None
    self.target_space = None

    if word_pairs is not None:
        if len(word_pairs[0]) != 2:
            raise ValueError("Each training data item must contain two different language words.")
        self.train(word_pairs)
def __init__(self, src_model, tgt_model, word_pairs=None, random_state=None):
    """
    Initialize the model from a list of `word_pairs`. Each word pair is a tuple of
    (source language word, target language word).
    Example: [("one", "uno"), ("two", "due")]

    Args:
        `word_pairs` (list): a list of word pairs
        `src_model` (Word2Vec): a word2vec model of the source language
        `tgt_model` (Word2Vec): a word2vec model of the target language
    """
    self.source_word = None
    self.target_word = None
    self.src_model = src_model
    self.tgt_model = tgt_model
    self.src_model.init_sims()
    self.tgt_model.init_sims()
    # self.src_mat = normalize(src_model.wv.vectors)
    # self.tgt_mat = normalize(tgt_model.wv.vectors)
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None

    if word_pairs is not None:
        if len(word_pairs[0]) != 2:
            raise ValueError("Each training data item must contain two different language words.")
        self.train(word_pairs)
def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None):
    """
    Parameters
    ----------
    source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
        Word vectors for source language.
    target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
        Word vectors for target language.
    word_pairs : list of (str, str), optional
        Pairs of words that will be used for training.
    random_state : {None, int, array_like}, optional
        Seed for random state.

    """
    self.source_word = None
    self.target_word = None
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
    self.source_space = None
    self.target_space = None

    if word_pairs is not None:
        if len(word_pairs[0]) != 2:
            raise ValueError("Each training data item must contain two different language words.")
        self.train(word_pairs)
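# --- Hedged usage sketch for the word-level TranslationMatrix constructor above -------
# Illustrative only: the toy corpora, word pairs and seeds are invented, the class is
# assumed to live in gensim.models.translation_matrix, and the Word2Vec keyword names
# assume a recent gensim release (`vector_size` was `size` in older versions).
from gensim.models import Word2Vec
from gensim.models.translation_matrix import TranslationMatrix

src_sentences = [["one", "two", "three"], ["three", "two", "one"]]
tgt_sentences = [["uno", "due", "tre"], ["tre", "due", "uno"]]
src_model = Word2Vec(src_sentences, vector_size=10, min_count=1, seed=1)
tgt_model = Word2Vec(tgt_sentences, vector_size=10, min_count=1, seed=1)

word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre")]

# Because word_pairs is given, train() runs inside __init__ and fills translation_matrix.
trans = TranslationMatrix(src_model.wv, tgt_model.wv, word_pairs=word_pairs, random_state=0)
print(trans.translation_matrix.shape)  # (10, 10): linear map from source space to target space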
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LdaModel.load(fname, mmap='r')

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LdaModel, cls).load(fname, *args, **kwargs)

    # check if `random_state` attribute has been set after main pickle load
    # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
    # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
    # so set `random_state` to the default value
    if not hasattr(result, 'random_state'):
        result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
        logging.warning("random_state not set so using default value")

    state_fname = utils.smart_extension(fname, '.state')
    try:
        result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load state from %s: %s", state_fname, e)

    id2word_fname = utils.smart_extension(fname, '.id2word')
    # check if the `id2word_fname` file is present on disk
    # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim,
    # so set `result.id2word` from the `id2word_fname` file
    # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
    # so `result.id2word` is already set after the main pickle load
    if os.path.isfile(id2word_fname):
        try:
            result.id2word = utils.unpickle(id2word_fname)
        except Exception as e:
            logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
    return result
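# --- Hedged sketch of the save/load round trip supported by the load() override above ---
# Illustrative only: the tiny corpus and the /tmp path are invented. save() also writes
# the .state and .id2word side files that load() picks up, and mmap='r' maps large
# arrays back read-only as described in the docstring.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=0)
lda.save("/tmp/toy_lda.model")
lda_again = LdaModel.load("/tmp/toy_lda.model", mmap='r')
print(lda_again.random_state is not None)  # older saves get a default random_state on load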
def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None):
    """
    Parameters
    ----------
    source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
        Word vectors for source language.
    target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
        Word vectors for target language.
    word_pairs : list of (str, str), optional
        Pairs of words that will be used for training.
    random_state : {None, int, array_like}, optional
        Seed for random state.

    """
    self.source_word = None
    self.target_word = None
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
    self.source_space = None
    self.target_space = None

    if word_pairs is not None:
        if len(word_pairs[0]) != 2:
            raise ValueError("Each training data item must contain two different language words.")
        self.train(word_pairs)
def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None):
    """
    Initialize the model from a list of `word_pairs`. Each word pair is a tuple of
    (source language word, target language word).
    Example: [("one", "uno"), ("two", "due")]

    Args:
        `word_pairs` (list): a list of word pairs
        `source_lang_vec` (KeyedVectors): word vectors of the source language
        `target_lang_vec` (KeyedVectors): word vectors of the target language
    """
    self.source_word = None
    self.target_word = None
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
    self.source_space = None
    self.target_space = None

    if word_pairs is not None:
        if len(word_pairs[0]) != 2:
            raise ValueError("Each training data item must contain two different language words.")
        self.train(word_pairs)
def reparam_topics(old_model, ntopics, new_id2word, new_date_label):
    """Return the sstats from an old model shoehorned into the shape of the new model.

    If ntopics is smaller than the old model, drop the least frequent topics;
    if it is larger, add new topics and seed them with some small value.
    Map the old words onto the new terms.
    Return the new parameters and the mapping between old and new topics."""
    # initialize sstats of shape [len(new_id2word), ntopics]
    # self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    random_state = gensim_utils.get_random_state(None)
    new_sstats = random_state.gamma(
        100., 1. / 100., (len(new_id2word), old_model.sstats.shape[1]))
    topics = old_model.topic_names

    # map common words between old and new vocabularies
    old_dict = {v: k for k, v in old_model.id2word.items()}
    new_dict = {v: k for k, v in new_id2word.items()}
    common_words = filter(old_dict.has_key, new_dict.keys())
    common_words.sort()
    common_old_keys = [old_dict.get(i) for i in common_words]
    common_new_keys = [new_dict.get(i) for i in common_words]
    new_sstats[common_new_keys, :] = old_model.sstats[common_old_keys, :]

    # remove or add topics (columns)
    to_add = ntopics - new_sstats.shape[1]
    if to_add > 0:
        new_sstats = np.hstack(
            (new_sstats, random_state.gamma(100., 1. / 100., (len(new_id2word), to_add))))
        new_topics = [new_date_label + ":" + str(i) for i in xrange(to_add)]
        topics = old_model.topic_names + new_topics
    elif to_add < 0:
        worst = old_model.topic_freq.iloc[old_model.topic_freq.shape[0] - 1, :].argsort()[range(-to_add)].tolist()
        new_sstats = np.delete(new_sstats, worst, axis=1)
        topics = [i for j, i in enumerate(old_model.topic_names) if j not in worst]
        # worst = old_model.topic_freq.argsort()[:to_add]
        # mask = np.delete(np.arange(new_sstats.shape[1]), worst)
        # new_sstats = new_sstats[:,mask]

    # check that sstats has shape [len(new_id2word), ntopics]
    assert new_sstats.shape[1] == ntopics
    assert new_sstats.shape[0] == len(new_id2word)
    return new_sstats, topics
def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=None):
    """
    Initialize the model from a list of `tagged_docs`. Each item is a tagged document
    whose vector is present in both the source and target Doc2Vec models.

    Args:
        `tagged_docs` (list): a list of tagged documents
        `source_lang_vec` (Doc2Vec): provides the source language document vectors
        `target_lang_vec` (Doc2Vec): provides the target language document vectors
    """
    self.tagged_docs = tagged_docs
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=None):
    """
    Parameters
    ----------
    tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
        Documents that will be used for training.
    source_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec`
        Source Doc2Vec model.
    target_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec`
        Target Doc2Vec model.
    random_state : {None, int, array_like}, optional
        Seed for random state.

    """
    self.tagged_docs = tagged_docs
    self.source_lang_vec = source_lang_vec
    self.target_lang_vec = target_lang_vec
    self.random_state = utils.get_random_state(random_state)
    self.translation_matrix = None
def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): """ `gamma`: first level concentration `alpha`: second level concentration `eta`: the topic Dirichlet `T`: top level truncation level `K`: second level truncation level `kappa`: learning rate `tau`: slow down parameter `max_time`: stop training after this many seconds `max_chunks`: stop after having processed this many chunks (wrap around corpus beginning in another corpus pass, if there are not enough chunks in the corpus) """ self.corpus = corpus self.id2word = id2word self.chunksize = chunksize self.max_chunks = max_chunks self.max_time = max_time self.outputdir = outputdir self.random_state = utils.get_random_state(random_state) self.lda_alpha = None self.lda_beta = None self.m_W = len(id2word) self.m_D = 0 if corpus: self.m_D = len(corpus) self.m_T = T self.m_K = K self.m_alpha = alpha self.m_gamma = gamma self.m_var_sticks = np.zeros((2, T - 1)) self.m_var_sticks[0] = 1.0 self.m_var_sticks[1] = range(T - 1, 0, -1) self.m_varphi_ss = np.zeros(T) self.m_lambda = self.random_state.gamma( 1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta self.m_eta = eta self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda) self.m_tau = tau + 1 self.m_kappa = kappa self.m_scale = scale self.m_updatect = 0 self.m_status_up_to_date = True self.m_num_docs_processed = 0 self.m_timestamp = np.zeros(self.m_W, dtype=int) self.m_r = [0] self.m_lambda_sum = np.sum(self.m_lambda, axis=1) self.m_var_converge = var_converge if self.outputdir: self.save_options() # if a training corpus was provided, start estimating the model right away if corpus is not None: self.update(corpus)
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf={}, minimum_phi_value=0.01, per_word_topics=False): """ If given, start training from the iterable `corpus` straight away. If not given, the model is left untrained (presumably because you want to call `update()` manually). `num_topics` is the number of requested latent topics to be extracted from the training corpus. `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics prior. `alpha` can be set to an explicit array = prior of your choice. It also support special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word distributions, or a vector of shape num_words, which can be used to impose (user defined) asymmetric priors over the word distribution. It also supports the special value 'auto', which learns an asymmetric prior over words directly from your data. `eta` can also be a matrix of shape num_topics x num_words, which can be used to impose asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up a cluster of machines for gensim). Calculate and log perplexity estimate from the latest mini-batch every `eval_every` model updates (setting this to 1 slows down training ~2x; default is 10 for better performance). Set to None to disable perplexity estimation. `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. `minimum_probability` controls filtering the topics returned for a document (bow). 
`random_state` can be a np.random.RandomState object or the seed for one Example: >>> lda = LdaModel(corpus, num_topics=100) # train model >>> print(lda[doc_bow]) # get topic probability distribution for a document >>> lda.update(corpus2) # update the LDA model with additional documents >>> print(lda[doc_bow]) >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data """ # store user-supplied parameters self.id2word = id2word if corpus is None and self.id2word is None: raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: self.num_terms = 1 + max(self.id2word.keys()) else: self.num_terms = 0 if self.num_terms == 0: raise ValueError("cannot compute LDA over an empty collection (no terms)") self.distributed = bool(distributed) self.num_topics = int(num_topics) self.chunksize = chunksize self.decay = decay self.offset = offset self.minimum_probability = minimum_probability self.num_updates = 0 self.passes = passes self.update_every = update_every self.eval_every = eval_every self.minimum_phi_value = minimum_phi_value self.per_word_topics = per_word_topics self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': raise ValueError("The 'asymmetric' option cannot be used for eta") self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') self.random_state = utils.get_random_state(random_state) assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), ( "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)) # VB constants self.iterations = iterations self.gamma_threshold = gamma_threshold # set up distributed environment if necessary if not distributed: logger.info("using serial LDA version on this node") self.dispatcher = None self.numworkers = 1 else: if self.optimize_alpha: raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA") # set up distributed version try: import Pyro4 with utils.getNS(**ns_conf) as ns: from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri)) self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize, alpha=alpha, eta=eta, distributed=False) self.numworkers = len(self.dispatcher.getworkers()) logger.info("using distributed version with %i workers" % self.numworkers) except Exception as err: logger.error("failed to initialize distributed LDA (%s)", err) raise RuntimeError("failed to initialize distributed LDA (%s)" % err) # Initialize the variational distribution q(beta|lambda) self.state = LdaState(self.eta, (self.num_topics, self.num_terms)) self.state.sstats = self.random_state.gamma(100., 1. 
/ 100., (self.num_topics, self.num_terms)) self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats)) # if a training corpus was provided, start estimating the model right away if corpus is not None: use_numpy = self.dispatcher is not None self.update(corpus, chunks_as_numpy=use_numpy)
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None, chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): """ If the iterable corpus and one of author2doc/doc2author dictionaries are given, start training straight away. If not given, the model is left untrained (presumably because you want to call the `update` method manually). `num_topics` is the number of requested latent topics to be extracted from the training corpus. `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. `author2doc` is a dictionary where the keys are the names of authors, and the values are lists of documents that the author contributes to. `doc2author` is a dictionary where the keys are document IDs (indexes to corpus) and the values are lists of author names. I.e. this is the reverse mapping of `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be supplied. `passes` is the number of times the model makes a pass over the entire trianing data. `iterations` is the maximum number of times the model loops over each document (M-step). The iterations stop when convergence is reached. `chunksize` controls the size of the mini-batches. `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic (theta) and topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics prior. `alpha` can be set to an explicit array = prior of your choice. It also support special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word distributions, or a vector of shape num_words, which can be used to impose (user defined) asymmetric priors over the word distribution. It also supports the special value 'auto', which learns an asymmetric prior over words directly from your data. `eta` can also be a matrix of shape num_topics x num_words, which can be used to impose asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). Calculate and log perplexity estimate from the latest mini-batch every `eval_every` model updates. Set to None to disable perplexity estimation. `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. `decay` controls how quickly old documents are forgotten, while `offset` down-weights early iterations. `minimum_probability` controls filtering the topics returned for a document (bow). `random_state` can be an integer or a numpy.random.RandomState object. Set the state of the random number generator inside the author-topic model, to ensure reproducibility of your experiments, for example. `serialized` indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`) or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from other Gensim models. If your data is too large to fit in to memory, use this functionality. Note that calling `AuthorTopicModel.update` with new data may be cumbersome as it requires all the existing data to be re-serialized. `serialization_path` must be set to a filepath, if `serialized = True` is used. 
Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your working directory by setting `serialization_path = serialized_model.mm`. An existing file *cannot* be overwritten; either delete the old file or choose a different name. Example: >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model >>> model.update(corpus2) # update the author-topic model with additional documents >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data """ # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState. distributed = False self.dispatcher = None self.numworkers = 1 self.id2word = id2word if corpus is None and self.id2word is None: raise ValueError( "at least one of corpus/id2word must be specified, to establish input space dimensionality" ) if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: self.num_terms = 1 + max(self.id2word.keys()) else: self.num_terms = 0 if self.num_terms == 0: raise ValueError("cannot compute the author-topic model over an empty collection (no terms)") logger.info('Vocabulary consists of %d words.', self.num_terms) self.author2doc = {} self.doc2author = {} self.distributed = distributed self.num_topics = num_topics self.num_authors = 0 self.chunksize = chunksize self.decay = decay self.offset = offset self.minimum_probability = minimum_probability self.num_updates = 0 self.total_docs = 0 self.passes = passes self.update_every = update_every self.eval_every = eval_every self.author2id = {} self.id2author = {} self.serialized = serialized if serialized and not serialization_path: raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).") if serialized and serialization_path: assert not isfile(serialization_path), \ "A file already exists at the serialization_path path; " \ "choose a different serialization_path, or delete the file." self.serialization_path = serialization_path # Initialize an empty self.corpus. self.init_empty_corpus() self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == (self.num_topics,), \ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': raise ValueError("The 'asymmetric' option cannot be used for eta") self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') self.random_state = utils.get_random_state(random_state) assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), ( "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms) ) # VB constants self.iterations = iterations self.gamma_threshold = gamma_threshold # Initialize the variational distributions q(beta|lambda) and q(theta|gamma) self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics)) self.state.sstats = self.random_state.gamma(100., 1. 
/ 100., (self.num_topics, self.num_terms)) self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats)) # if a training corpus was provided, start estimating the model right away if corpus is not None and (author2doc is not None or doc2author is not None): use_numpy = self.dispatcher is not None self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): """ Parameters ---------- corpus : iterable of list of (int, float) Corpus in BoW format. id2word : :class:`~gensim.corpora.dictionary.Dictionary` Dictionary for the input corpus. max_chunks : int, optional Upper bound on how many chunks to process. It wraps around corpus beginning in another corpus pass, if there are not enough chunks in the corpus. max_time : int, optional Upper bound on time (in seconds) for which model will be trained. chunksize : int, optional Number of documents in one chuck. kappa: float,optional Learning parameter which acts as exponential decay factor to influence extent of learning from each batch. tau: float, optional Learning parameter which down-weights early iterations of documents. K : int, optional Second level truncation level T : int, optional Top level truncation level alpha : int, optional Second level concentration gamma : int, optional First level concentration eta : float, optional The topic Dirichlet scale : float, optional Weights information from the mini-chunk of corpus to calculate rhot. var_converge : float, optional Lower bound on the right side of convergence. Used when updating variational parameters for a single document. outputdir : str, optional Stores topic and options information in the specified directory. random_state : {None, int, array_like, :class:`~np.random.RandomState`, optional} Adds a little random jitter to randomize results around same alpha when trying to fetch a closest corresponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model` """ self.corpus = corpus self.id2word = id2word self.chunksize = chunksize self.max_chunks = max_chunks self.max_time = max_time self.outputdir = outputdir self.random_state = utils.get_random_state(random_state) self.lda_alpha = None self.lda_beta = None self.m_W = len(id2word) self.m_D = 0 if corpus: self.m_D = len(corpus) self.m_T = T self.m_K = K self.m_alpha = alpha self.m_gamma = gamma self.m_var_sticks = np.zeros((2, T - 1)) self.m_var_sticks[0] = 1.0 self.m_var_sticks[1] = range(T - 1, 0, -1) self.m_varphi_ss = np.zeros(T) self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta self.m_eta = eta self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda) self.m_tau = tau + 1 self.m_kappa = kappa self.m_scale = scale self.m_updatect = 0 self.m_status_up_to_date = True self.m_num_docs_processed = 0 self.m_timestamp = np.zeros(self.m_W, dtype=int) self.m_r = [0] self.m_lambda_sum = np.sum(self.m_lambda, axis=1) self.m_var_converge = var_converge if self.outputdir: self.save_options() # if a training corpus was provided, start estimating the model right away if corpus is not None: self.update(corpus)
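# --- Hedged usage sketch for the HdpModel constructor documented above ----------------
# Illustrative only: the toy corpus and the smaller-than-default truncation levels
# (K, T) are invented, chosen just to keep the example fast.
from gensim.corpora import Dictionary
from gensim.models import HdpModel

texts = [
    ["human", "interface", "computer"],
    ["graph", "trees", "computer"],
    ["graph", "minors", "survey"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# The corpus is passed in, so update() runs at the end of __init__ as shown above.
hdp = HdpModel(corpus, id2word=dictionary, K=5, T=20, random_state=0)
for topic in hdp.print_topics(num_topics=3, num_words=4):
    print(topic)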
def __init__( self, corpus=None, num_topics=100, id2word=None, chunksize=2000, passes=1, kappa=1.0, minimum_probability=0.01, w_max_iter=200, w_stop_condition=1e-4, h_max_iter=50, h_stop_condition=1e-3, eval_every=10, normalize=True, random_state=None, ): r""" Parameters ---------- corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents), optional Training corpus. Can be either iterable of documents, which are lists of `(word_id, word_count)`, or a sparse csc matrix of BOWs for each document. If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`). num_topics : int, optional Number of topics to extract. id2word: {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`} Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for debugging and topic printing. chunksize: int, optional Number of documents to be used in each training chunk. passes: int, optional Number of full passes over the training corpus. Leave at default `passes=1` if your input is an iterator. kappa : float, optional Gradient descent step size. Larger value makes the model train faster, but could lead to non-convergence if set too large. minimum_probability: If `normalize` is True, topics with smaller probabilities are filtered out. If `normalize` is False, topics with smaller factors are filtered out. If set to None, a value of 1e-8 is used to prevent 0s. w_max_iter: int, optional Maximum number of iterations to train W per each batch. w_stop_condition: float, optional If error difference gets less than that, training of ``W`` stops for the current batch. h_max_iter: int, optional Maximum number of iterations to train h per each batch. h_stop_condition: float If error difference gets less than that, training of ``h`` stops for the current batch. eval_every: int, optional Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low. normalize: bool or None, optional Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c. random_state: {np.random.RandomState, int}, optional Seed for random generator. Needed for reproducibility. """ self.num_topics = num_topics self.id2word = id2word self.chunksize = chunksize self.passes = passes self._kappa = kappa self.minimum_probability = minimum_probability self._w_max_iter = w_max_iter self._w_stop_condition = w_stop_condition self._h_max_iter = h_max_iter self._h_stop_condition = h_stop_condition self.eval_every = eval_every self.normalize = normalize self.random_state = utils.get_random_state(random_state) self.v_max = None if self.id2word is None: self.id2word = utils.dict_from_corpus(corpus) self.num_tokens = len(self.id2word) self.A = None self.B = None self._W = None self.w_std = None self._w_error = np.inf self._h = None if corpus is not None: self.update(corpus)
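# --- Hedged usage sketch for the Nmf constructor documented above ---------------------
# Illustrative only: the toy corpus is invented and the module path gensim.models.nmf
# is assumed for this class.
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf

texts = [
    ["human", "interface", "computer"],
    ["graph", "trees", "computer"],
    ["graph", "minors", "survey"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Passing the corpus triggers update() at the end of __init__, as in the snippet above.
nmf = Nmf(corpus, id2word=dictionary, num_topics=2, passes=5, random_state=42)
print(nmf.show_topics(num_topics=2, num_words=4))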
def __init__( self, corpus=None, num_topics=100, id2word=None, chunksize=2000, passes=1, lambda_=1.0, kappa=1.0, minimum_probability=0.01, use_r=False, w_max_iter=200, w_stop_condition=1e-4, h_r_max_iter=50, h_r_stop_condition=1e-3, eval_every=10, v_max=None, normalize=True, sparse_coef=3, random_state=None, ): """ Parameters ---------- corpus : iterable of list of (int, float), optional Training corpus. If not given, model is left untrained. num_topics : int, optional Number of topics to extract. id2word: gensim.corpora.Dictionary, optional Mapping from token id to token. If not set words get replaced with word ids. chunksize: int, optional Number of documents to be used in each training chunk. passes: int, optioanl Number of full passes over the training corpus. lambda_ : float, optional Residuals regularizer coefficient. Increasing it helps prevent ovefitting. Has no effect if `use_r` is set to False. kappa : float, optional Optimizer step coefficient. Increaing it makes model train faster, but adds a risk that it won't converge. w_max_iter: int, optional Maximum number of iterations to train W matrix per each batch. w_stop_condition: float, optional If error difference gets less than that, training of matrix ``W`` stops for current batch. h_r_max_iter: int, optional Maximum number of iterations to train h and r matrices per each batch. h_r_stop_condition: float If error difference gets less than that, training of matrices ``h`` and ``r`` stops for current batch. eval_every: int, optional Number of batches after which model will be evaluated. v_max: int, optional Maximum number of word occurrences in the corpora. Inferred if not set. Rarely needs to be set explicitly. normalize: bool, optional Whether to normalize results. Offers "kind-of-probabilistic" result. sparse_coef: float, optional The more it is, the more sparse are matrices. Significantly increases performance. random_state: {np.random.RandomState, int}, optional Seed for random generator. Useful for reproducibility. """ self._w_error = None self.num_tokens = None self.num_topics = num_topics self.id2word = id2word self.chunksize = chunksize self.passes = passes self._lambda_ = lambda_ self._kappa = kappa self.minimum_probability = minimum_probability self.use_r = use_r self._w_max_iter = w_max_iter self._w_stop_condition = w_stop_condition self._h_r_max_iter = h_r_max_iter self._h_r_stop_condition = h_r_stop_condition self.v_max = v_max self.eval_every = eval_every self.normalize = normalize self.sparse_coef = sparse_coef self.random_state = utils.get_random_state(random_state) if self.id2word is None: self.id2word = utils.dict_from_corpus(corpus) self.num_tokens = len(self.id2word) self.A = None self.B = None self._W = None self.w_std = None self._h = None self._r = None if corpus is not None: self.update(corpus)
def get_inference_penalty(net, hidden_size, docs_path, topic_num): # train the lda model selected_docs = pd.read_csv(docs_path, header=None, index_col=[0]).values print 'number of docs:', selected_docs.shape # print selected_docs[:5] texts = [[word for word in doc[0].split(' ')] for doc in selected_docs] # pprint(texts[:5]) dictionary = corpora.Dictionary(texts) dictionary.save_as_text(Path+'/data-repository/available_word_in_literature.csv') print dictionary # print dictionary.token2id corpus = [dictionary.doc2bow(text) for text in texts] print corpus[:5] print len(corpus) lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_num, update_every=1, chunksize=1000, passes=1) # to inference the new doc # initialize the variational distribution q(theta|gamma) for the chunk init_gamma = utils.get_random_state(None).gamma(100., 1. / 100., (hidden_size, topic_num)) Elogtheta = matutils.dirichlet_expectation(init_gamma) expElogtheta = np.exp(Elogtheta) converged = 0 # Now, for each document d update that document's gamma and phi # Inference code copied from Hoffman's `onlineldavb.py` (esp. the # Lee&Seung trick which speeds things up by an order of magnitude, compared # to Blei's original LDA-C code, cool!). for para_iter, para in enumerate(net.parameters()): if para_iter == 0: para_data = para.abs() for d, doc in enumerate(chunk): if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)): # make sure the term IDs are ints, otherwise np will get upset ids = [int(idx) for idx, _ in doc] else: ids = [idx for idx, _ in doc] cts = np.array([cnt for _, cnt in doc]) gammad = init_gamma[d, :] Elogthetad = Elogtheta[d, :] expElogthetad = expElogtheta[d, :] expElogbetad = lda_model.expElogbeta[:, ids] # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w. # phinorm is the normalizer. # TODO treat zeros explicitly, instead of adding 1e-100? phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 # Iterate between gamma and phi until convergence for _ in xrange(lda_model.iterations): lastgamma = gammad # We represent phi implicitly to save memory and time. # Substituting the value of the optimal phi back into # the update for gamma gives this update. Cf. Lee&Seung 2001. gammad = lda_model.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T) Elogthetad = matutils.dirichlet_expectation(gammad) expElogthetad = np.exp(Elogthetad) phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 # If gamma hasn't changed much, we're done. meanchange = np.mean(abs(gammad - lastgamma)) if meanchange < lda_model.gamma_threshold: converged += 1 break init_gamma[d, :] = gammad pass
def testRandomState():
    testcases = [np.random.seed(0), None, np.random.RandomState(0), 0]
    for testcase in testcases:
        assert isinstance(utils.get_random_state(testcase), np.random.RandomState)
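# --- Hedged example of what utils.get_random_state accepts, mirroring the test above ---
# Illustrative only; note that np.random.seed(0) in the test's list returns None, so that
# entry exercises the same code path as passing None explicitly.
import numpy as np
from gensim import utils

rs = utils.get_random_state(0)                      # int seed -> fresh RandomState
same_draw = utils.get_random_state(0).randint(100)  # reproducible for equal seeds
print(rs.randint(100) == same_draw)                 # True

passthrough = np.random.RandomState(7)
print(utils.get_random_state(passthrough) is passthrough)  # an existing RandomState is returned as-is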
def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): """ `gamma`: first level concentration `alpha`: second level concentration `eta`: the topic Dirichlet `T`: top level truncation level `K`: second level truncation level `kappa`: learning rate `tau`: slow down parameter `max_time`: stop training after this many seconds `max_chunks`: stop after having processed this many chunks (wrap around corpus beginning in another corpus pass, if there are not enough chunks in the corpus) """ self.corpus = corpus self.id2word = id2word self.chunksize = chunksize self.max_chunks = max_chunks self.max_time = max_time self.outputdir = outputdir self.random_state = utils.get_random_state(random_state) self.lda_alpha = None self.lda_beta = None self.m_W = len(id2word) self.m_D = 0 if corpus: self.m_D = len(corpus) self.m_T = T self.m_K = K self.m_alpha = alpha self.m_gamma = gamma self.m_var_sticks = np.zeros((2, T - 1)) self.m_var_sticks[0] = 1.0 self.m_var_sticks[1] = range(T - 1, 0, -1) self.m_varphi_ss = np.zeros(T) self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta self.m_eta = eta self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda) self.m_tau = tau + 1 self.m_kappa = kappa self.m_scale = scale self.m_updatect = 0 self.m_status_up_to_date = True self.m_num_docs_processed = 0 self.m_timestamp = np.zeros(self.m_W, dtype=int) self.m_r = [0] self.m_lambda_sum = np.sum(self.m_lambda, axis=1) self.m_var_converge = var_converge if self.outputdir: self.save_options() # if a training corpus was provided, start estimating the model right away if corpus is not None: self.update(corpus)
def __init__(self, corpus=None, num_topics=100, alpha='symmetric', beta=None, num_passes=10, minimum_prob=0.01, random_state=None, dtype=np.float32): # TODO FIX: doesn't work when instantiated without a corpus and then trained later """ Args: corpus: If given, start training from the iterable `corpus` straight away. If not given, the model is left untrained (presumably because you want to call `train()` manually). num_topics: The number of requested latent topics to be extracted from the training corpus. alpha: Hyperparameter of the Dirichlet prior over the topics in a document. beta: Hyperparameter of the Dirichlet prior over the terms in a topic. num_passes: The number of passes of the MCMC procedure. One pass is one step per term in each document of the whole corpus. minimum_prob: Minimum probability required for an object (term, topic) to be displayed (TODO should remove this) random_state: TODO findout what is this dtype: Data-type to use during calculations inside model. All inputs are also converted to this dtype. Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`. """ if dtype not in DTYPE_TO_EPS: raise ValueError( "Incorrect 'dtype', please choose one of {}".format(", ".join( "numpy.{}".format(tp.__name__) for tp in sorted(DTYPE_TO_EPS)))) self.dtype = dtype logger.info( "creating a new lda collapsed gibbs sampling model with {0} topics" .format(num_topics)) # store user-supplied parameters if corpus is not None: self.id2word = corpus.dictionary self.num_terms = 1 + max(self.id2word.keys()) else: self.id2word = None self.num_terms = 0 self.num_topics = int(num_topics) self.minimum_probability = minimum_prob self.random_state = utils.get_random_state(random_state) self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == (self.num_topics,), \ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(beta, six.string_types): if beta == 'asymmetric': raise ValueError( "The 'asymmetric' option cannot be used for beta") self.beta, self.optimize_beta = self.init_dir_prior(beta, 'beta') assert self.beta.shape == (self.num_terms, ) or self.beta.shape == ( self.num_topics, self.num_terms ), ("Invalid beta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.beta.shape), self.num_terms, self.num_topics, self.num_terms)) self.w_beta = sum(self.beta) # if a training corpus was provided, start estimating the model right away if corpus is not None: self.train(corpus, num_passes=num_passes)
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None, chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): """ If the iterable corpus and one of author2doc/doc2author dictionaries are given, start training straight away. If not given, the model is left untrained (presumably because you want to call the `update` method manually). `num_topics` is the number of requested latent topics to be extracted from the training corpus. `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. `author2doc` is a dictionary where the keys are the names of authors, and the values are lists of documents that the author contributes to. `doc2author` is a dictionary where the keys are document IDs (indexes to corpus) and the values are lists of author names. I.e. this is the reverse mapping of `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be supplied. `passes` is the number of times the model makes a pass over the entire trianing data. `iterations` is the maximum number of times the model loops over each document (M-step). The iterations stop when convergence is reached. `chunksize` controls the size of the mini-batches. `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic (theta) and topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics prior. `alpha` can be set to an explicit array = prior of your choice. It also support special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word distributions, or a vector of shape num_words, which can be used to impose (user defined) asymmetric priors over the word distribution. It also supports the special value 'auto', which learns an asymmetric prior over words directly from your data. `eta` can also be a matrix of shape num_topics x num_words, which can be used to impose asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). Calculate and log perplexity estimate from the latest mini-batch every `eval_every` model updates. Set to None to disable perplexity estimation. `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. `decay` controls how quickly old documents are forgotten, while `offset` down-weights early iterations. `minimum_probability` controls filtering the topics returned for a document (bow). `random_state` can be an integer or a numpy.random.RandomState object. Set the state of the random number generator inside the author-topic model, to ensure reproducibility of your experiments, for example. `serialized` indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`) or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from other Gensim models. If your data is too large to fit in to memory, use this functionality. Note that calling `AuthorTopicModel.update` with new data may be cumbersome as it requires all the existing data to be re-serialized. `serialization_path` must be set to a filepath, if `serialized = True` is used. 
Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your working directory by setting `serialization_path = serialized_model.mm`. An existing file *cannot* be overwritten; either delete the old file or choose a different name. Example: >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model >>> model.update(corpus2) # update the author-topic model with additional documents >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data """ # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState. distributed = False self.dispatcher = None self.numworkers = 1 self.id2word = id2word if corpus is None and self.id2word is None: raise ValueError( 'at least one of corpus/id2word must be specified, to establish input space dimensionality' ) if self.id2word is None: logger.warning( "no word id mapping provided; initializing from corpus, assuming identity" ) self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: self.num_terms = 1 + max(self.id2word.keys()) else: self.num_terms = 0 if self.num_terms == 0: raise ValueError( "cannot compute the author-topic model over an empty collection (no terms)" ) logger.info('Vocabulary consists of %d words.', self.num_terms) self.author2doc = {} self.doc2author = {} self.distributed = distributed self.num_topics = num_topics self.num_authors = 0 self.chunksize = chunksize self.decay = decay self.offset = offset self.minimum_probability = minimum_probability self.num_updates = 0 self.total_docs = 0 self.passes = passes self.update_every = update_every self.eval_every = eval_every self.author2id = {} self.id2author = {} self.serialized = serialized if serialized and not serialization_path: raise ValueError( "If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path)." ) if serialized and serialization_path: assert not isfile( serialization_path ), "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file." self.serialization_path = serialization_path # Initialize an empty self.corpus. self.init_empty_corpus() self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == ( self.num_topics, ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str( self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': raise ValueError( "The 'asymmetric' option cannot be used for eta") self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') self.random_state = utils.get_random_state(random_state) assert ( self.eta.shape == (self.num_terms, ) or self.eta.shape == (self.num_topics, self.num_terms) ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)) # VB constants self.iterations = iterations self.gamma_threshold = gamma_threshold # Initialize the variational distributions q(beta|lambda) and q(theta|gamma) self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics)) self.state.sstats = self.random_state.gamma( 100., 1. 
/ 100., (self.num_topics, self.num_terms)) self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats)) # if a training corpus was provided, start estimating the model right away if corpus is not None and (author2doc is not None or doc2author is not None): use_numpy = self.dispatcher is not None self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
def __init__(self, corpus=None, similarity_matrix=None, num_topics=100, alpha='symmetric', beta=None, smooth_factor=0.1, num_passes=10, minimum_prob=0.01, random_state=None, dtype=np.float32): # TODO Comments """ Args: corpus: If given, start training from the iterable `corpus` straight away. If not given, the model is left untrained (presumably because you want to call `train()` manually). similarity_matrix: stochastic matrix representing semantic similarity between words. Should be a numpy array in dense or sparse (scipy.sparse) format. num_topics: The number of requested latent topics to be extracted from the training corpus. alpha: Hyperparameter of the Dirichlet prior over the topics in a document. beta: Hyperparameter of the Dirichlet prior over the terms in a topic. smooth_factor: parameter controlling the influence of neighbour words. num_passes: The number of passes of the MCMC procedure. One pass is one step per term in each document of the whole corpus. minimum_prob: TODO random_state: TODO dtype: Data-type to use during calculations inside model. All inputs are also converted to this dtype. Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`. """ if dtype not in DTYPE_TO_EPS: raise ValueError( "Incorrect 'dtype', please choose one of {}".format(", ".join( "numpy.{}".format(tp.__name__) for tp in sorted(DTYPE_TO_EPS)))) self.dtype = dtype logger.info( "creating a new lda graph sampler model with {0} topics".format( num_topics)) # store user-supplied parameters if corpus is not None: self.id2word = corpus.dictionary self.num_terms = 1 + max(self.id2word.keys()) else: self.id2word = None self.num_terms = 0 self.num_topics = int(num_topics) self.minimum_probability = minimum_prob self.random_state = utils.get_random_state(random_state) self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == (self.num_topics,), \ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(beta, six.string_types): if beta == 'asymmetric': raise ValueError( "The 'asymmetric' option cannot be used for beta") self.beta, self.optimize_beta = self.init_dir_prior(beta, 'beta') assert self.beta.shape == (self.num_terms, ) or self.beta.shape == ( self.num_topics, self.num_terms ), ("Invalid beta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.beta.shape), self.num_terms, self.num_topics, self.num_terms)) self.w_beta = sum(self.beta) self.term_seqs, self.topic_seqs, \ self.doc_topic_counts, self.term_topic_counts, \ self.terms_per_topic = \ self.get_seqs_and_counts(corpus=corpus) self.num_docs = len(self.term_seqs) # if a training corpus was provided, start estimating the model right away if corpus is not None: self.train(corpus, similarity_matrix=similarity_matrix, smooth_factor=smooth_factor, num_passes=num_passes) self.theta, self.phi = self.get_theta_phi()
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf={}, minimum_phi_value=0.01, per_word_topics=False): """ If given, start training from the iterable `corpus` straight away. If not given, the model is left untrained (presumably because you want to call `update()` manually). `num_topics` is the number of requested latent topics to be extracted from the training corpus. `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics prior. `alpha` can be set to an explicit array = prior of your choice. It also support special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word distributions, or a vector of shape num_words, which can be used to impose (user defined) asymmetric priors over the word distribution. It also supports the special value 'auto', which learns an asymmetric prior over words directly from your data. `eta` can also be a matrix of shape num_topics x num_words, which can be used to impose asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up a cluster of machines for gensim). Calculate and log perplexity estimate from the latest mini-batch every `eval_every` model updates (setting this to 1 slows down training ~2x; default is 10 for better performance). Set to None to disable perplexity estimation. `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. `minimum_probability` controls filtering the topics returned for a document (bow). 
`random_state` can be a np.random.RandomState object or the seed for one Example: >>> lda = LdaModel(corpus, num_topics=100) # train model >>> print(lda[doc_bow]) # get topic probability distribution for a document >>> lda.update(corpus2) # update the LDA model with additional documents >>> print(lda[doc_bow]) >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data """ # store user-supplied parameters self.id2word = id2word if corpus is None and self.id2word is None: raise ValueError( 'at least one of corpus/id2word must be specified, to establish input space dimensionality' ) if self.id2word is None: logger.warning( "no word id mapping provided; initializing from corpus, assuming identity" ) self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: self.num_terms = 1 + max(self.id2word.keys()) else: self.num_terms = 0 if self.num_terms == 0: raise ValueError( "cannot compute LDA over an empty collection (no terms)") self.distributed = bool(distributed) self.num_topics = int(num_topics) self.chunksize = chunksize self.decay = decay self.offset = offset self.minimum_probability = minimum_probability self.num_updates = 0 self.passes = passes self.update_every = update_every self.eval_every = eval_every self.minimum_phi_value = minimum_phi_value self.per_word_topics = per_word_topics self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == ( self.num_topics, ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str( self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': raise ValueError( "The 'asymmetric' option cannot be used for eta") self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') self.random_state = utils.get_random_state(random_state) assert ( self.eta.shape == (self.num_terms, ) or self.eta.shape == (self.num_topics, self.num_terms) ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)) # VB constants self.iterations = iterations self.gamma_threshold = gamma_threshold # set up distributed environment if necessary if not distributed: logger.info("using serial LDA version on this node") self.dispatcher = None self.numworkers = 1 else: if self.optimize_alpha: raise NotImplementedError( "auto-optimizing alpha not implemented in distributed LDA") # set up distributed version try: import Pyro4 with utils.getNS(**ns_conf) as ns: from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX self.dispatcher = Pyro4.Proxy( ns.list(prefix=LDA_DISPATCHER_PREFIX) [LDA_DISPATCHER_PREFIX]) logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri)) self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize, alpha=alpha, eta=eta, distributed=False) self.numworkers = len(self.dispatcher.getworkers()) logger.info("using distributed version with %i workers" % self.numworkers) except Exception as err: logger.error("failed to initialize distributed LDA (%s)", err) raise RuntimeError( "failed to initialize distributed LDA (%s)" % err) # Initialize the variational distribution q(beta|lambda) self.state = LdaState(self.eta, (self.num_topics, self.num_terms)) self.state.sstats = self.random_state.gamma( 100., 1. 
/ 100., (self.num_topics, self.num_terms)) self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats)) # if a training corpus was provided, start estimating the model right away if corpus is not None: use_numpy = self.dispatcher is not None self.update(corpus, chunks_as_numpy=use_numpy)
def testRandomState():
    testcases = [np.random.seed(0), None, np.random.RandomState(0), 0]
    for testcase in testcases:
        assert isinstance(utils.get_random_state(testcase), np.random.RandomState)