import numpy as np
from numpy.random import beta, dirichlet, rand
from numpy.random import multivariate_normal as N
from random import sample as rsample

import util
from util import get_cdf, sample

# NOTE: this import block is a reconstruction inferred from usage below;
# Topic_node and generate_one_doc_with_hlda are assumed to be defined
# elsewhere in this package.


def generate_docs_with_hlda(num_docs, words_per_doc, vocab_size,
                            topic_to_word_beta, topic_dist_m, topic_dist_pi,
                            new_child_gamma):
    """Generates documents from an hLDA topic tree rooted at a single node."""
    params = {}
    # Symmetric Dirichlet parameter vector over each topic's word distribution.
    params["topic_to_word_param"] = [topic_to_word_beta] * vocab_size
    params["words_per_doc_distribution"] = lambda: util.poisson(words_per_doc)
    # GEM(m, pi) stick-breaking prior, reparameterized as Beta(m*pi, (1-m)*pi).
    pta = topic_dist_m * topic_dist_pi
    ptb = topic_dist_pi - pta
    params["parent_topic_bias_sample"] = lambda: beta(pta, ptb)
    params["new_child_gamma"] = new_child_gamma
    topic_root = Topic_node(params)
    documents, topic_stay_probs, topic_paths, topics, levels = \
        zip(*[generate_one_doc_with_hlda(topic_root, params)
              for i in range(num_docs)])
    return documents, topic_root, topic_stay_probs, topic_paths, topics, levels

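# Example usage for the hLDA generator (a sketch; the parameter values are
# illustrative, and Topic_node / generate_one_doc_with_hlda must be supplied
# by the rest of the package):
#
#     docs, root, stay_probs, paths, topics, levels = generate_docs_with_hlda(
#         num_docs=100, words_per_doc=50, vocab_size=30,
#         topic_to_word_beta=0.1, topic_dist_m=0.5, topic_dist_pi=100.0,
#         new_child_gamma=1.0)
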
def generate(topics, words, words_per_doc):
    """Generates documents given per-document topic distributions (topics)
    and per-topic word distributions (words)."""
    num_docs = len(topics)
    # Precompute a sampling CDF for each topic's word distribution.
    word_cdfs = [util.get_cdf(word_dist) for word_dist in words]
    docs = []
    doc_topics = []
    for i in range(num_docs):
        if i % 100 == 0:
            print("reached document", i)
        num_words = util.poisson(words_per_doc)
        topic_dist = topics[i]
        topic_cdf = util.get_cdf(topic_dist)
        doc = []
        word_topics = []
        for word in range(num_words):
            # Draw a topic for this slot, then a word from that topic.
            topic = util.sample(topic_cdf)
            doc.append(util.sample(word_cdfs[topic]))
            word_topics.append(topic)
        docs.append(doc)
        doc_topics.append(word_topics)
    return docs, doc_topics

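# The util helpers used throughout this file are assumed to behave roughly as
# follows (a sketch inferred from usage, not the actual util module):
#
#     def get_cdf(dist):
#         # Cumulative distribution over outcome indices.
#         return np.cumsum(dist)
#
#     def sample(cdf):
#         # Inverse-CDF sampling: first index whose cumulative probability
#         # exceeds a uniform draw.
#         return int(np.searchsorted(cdf, np.random.rand()))
#
#     def poisson(lam, cap=None):
#         # Poisson draw, assumed clipped to at least 1 (and at most cap,
#         # when a cap is given, as in the PLSI branch below).
#         draw = max(1, np.random.poisson(lam))
#         return min(draw, cap) if cap is not None else draw
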
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=None, beta=None, noise=-1, plsi=False, ctm=False,
                  pareto=False):
    """Generates documents according to PLSI, CTM, or LDA.

    Args:
        num_topics: the number of underlying latent topics
        num_docs: the number of documents to generate
        words_per_doc: parameter to a Poisson distribution; determines the
            average number of words in a document
        vocab_size: the number of words in the vocabulary

        DISTRIBUTION PARAMETERS
        ---------------------
        Depending on the model, alpha and beta parameterize different
        distributions.
        LDA: assumes symmetric Dirichlet distributions (i.e. all elements in
            the parameter vector have the same value)
            alpha: parameter to the Dirichlet distribution over topics
            beta: parameter to the Dirichlet distribution over words
        PLSI:
            alpha: parameter to a Poisson distribution that determines the
                number of topics per document (each such topic has uniform
                probability; all other topics have probability 0)
            beta: as alpha, but the Poisson distribution instead controls
                the number of words per topic (each such word has uniform
                probability; all other words have probability 0)
        ---------------------

        noise: given as a probability; each word is replaced with a random
            word with probability noise (the default of -1 disables noise)
        plsi: flag to draw distributions according to PLSI (i.e. uniform
            distributions over randomly sampled subsets)
        ctm: flag to draw distributions according to CTM (i.e. topic
            proportions from a softmax of a multivariate Gaussian draw)
        pareto: flag to make the Dirichlet parameter Pareto-like (i.e. set
            each alpha_i = alpha / i, with a similar decay for beta)

    Returns:
        docs: the list of documents, each a list of words (represented by
            their indices in range(vocab_size))
        topics: a list of documents, each a list of topics (represented by
            their indices in range(num_topics))
        word_dist: the distribution over words for each topic; each row is
            the distribution for a different topic
        topic_dists: the distribution over topics for each document; each
            row is the distribution for a different document
    """
    # @TODO: integrate CTM parameters (i.e. mu and sigma) into alpha and beta
    # Note: an all-ones covariance is rank one, so every component of eta is
    # identical and the CTM topic distribution comes out uniform; the @TODO
    # above covers making these real parameters.
    mu = np.zeros(num_topics)
    sigma = np.ones((num_topics, num_topics))
    if plsi and ctm:
        print("plsi and ctm flags cannot both be active (returning None)")
        return None
    if not plsi and not ctm:
        if pareto:
            # Pareto-style asymmetric priors: alpha_i decays as alpha / i.
            alpha = [alpha / i for i in range(1, num_topics + 1)]
            beta = [np.sqrt(beta / i) for i in range(1, vocab_size + 1)]
            #beta = [beta / i for i in range(1, vocab_size + 1)]
        else:
            alpha = [alpha] * num_topics
            beta = [beta] * vocab_size
    if plsi or ctm:
        # Each topic puts uniform mass on a Poisson-sized random word subset.
        sig_words = [rsample(range(vocab_size), util.poisson(beta, vocab_size))
                     for t in range(num_topics)]
        word_dist = [np.zeros(vocab_size) for t in range(num_topics)]
        for i in range(num_topics):
            word_dist[i][sig_words[i]] = 1.0 / len(sig_words[i])
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = [get_cdf(topic) for topic in word_dist]
    topic_cdfs = []
    docs = []
    topics = []
    topic_dists = []
    for i in range(num_docs):
        if i % 100 == 0:
            print("reached document", i)
        if plsi:
            sig_topics = rsample(range(num_topics),
                                 util.poisson(alpha, num_topics))
            topic_dist = np.zeros(num_topics)
            topic_dist[sig_topics] = 1.0 / len(sig_topics)
        elif ctm:
            # Logistic-normal topic proportions (CTM).
            eta = N(mu, sigma)
            topic_dist = np.exp(eta) / np.sum(np.exp(eta))
        else:
            topic_dist = dirichlet(alpha)
        num_words = util.poisson(words_per_doc)
        doc = []
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        doc_topics = []
        for word in range(num_words):
            if rand() < noise:
                # Noise: replace this word with a uniformly random word and
                # mark its topic as -1 (unknown). rsample returns a list, so
                # take its single element rather than appending the list.
                doc.append(rsample(range(vocab_size), 1)[0])
                doc_topics.append(-1)
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
                doc_topics.append(topic)
        docs.append(doc)
        topics.append(doc_topics)
    return docs, topics, word_dist, topic_dists

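if __name__ == "__main__":
    # Small smoke test (a sketch with illustrative parameter values; assumes
    # the imports at the top of this file resolve).

    # LDA: symmetric Dirichlet priors over topics (alpha) and words (beta).
    docs, topics, word_dist, topic_dists = generate_docs(
        num_topics=5, num_docs=200, words_per_doc=50, vocab_size=30,
        alpha=0.1, beta=0.1)
    print(len(docs), "LDA documents; first ten words of doc 0:", docs[0][:10])

    # PLSI: alpha/beta are Poisson rates for topics-per-document and
    # words-per-topic respectively.
    docs, topics, word_dist, topic_dists = generate_docs(
        num_topics=5, num_docs=200, words_per_doc=50, vocab_size=30,
        alpha=3, beta=10, plsi=True)
    print(len(docs), "PLSI documents")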