def __init__(self, df, vocab, K, alpha, beta, random_state=None, previous_model=None):
    """
    Initialises the collapsed Gibbs sampling for LDA

    Arguments:
    - df: the dataframe of counts, documents x words
    - vocab: list of (word, bag_index) pairs, one entry per column of df
    - K: no. of topics
    - alpha: symmetric prior on document-topic assignment
    - beta: symmetric prior on word-topic assignment, either a scalar or an array with one entry per bag
    - random_state: numpy RandomState for reproducible sampling, if any
    - previous_model: previous LDA run, if any
    """
    print "CGS LDA initialising"
    self.df = df.replace(np.nan, 0)
    self.alpha = alpha

    self.D = df.shape[0]    # total no of docs
    self.N = df.shape[1]    # total no of words
    assert(len(vocab) == self.N)

    # index of each word
    self.vocab = [v[0] for v in vocab]
    # the bag index for each word in self.vocab, values can only be in [0, 1, ..., n_bags-1]
    self.vocab_type = [int(v[1]) for v in vocab]

    # assume exactly 3 bags
    self.bag_labels = bags
    self.n_bags = len(self.bag_labels)

    if hasattr(beta, "__len__"):
        # beta is an np array, must be the same length as the number of bags
        assert(len(beta) == self.n_bags)
        self.beta = beta
    else:
        # beta is a scalar, convert it into np array
        self.beta = np.array([beta])
        self.beta = np.repeat(self.beta, self.n_bags)

    # set total no of topics
    self.cv = False
    self.previous_model = previous_model
    if self.previous_model is not None:

        # if some old topics were fixed
        if hasattr(self.previous_model, 'selected_topics'):

            # no. of new topics
            self.K = K

            # no. of previously selected topics
            self.previous_K = len(self.previous_model.selected_topics)

            # Get the previous ckn and ck values from the training stage.
            # During gibbs update in this testing stage, assignment of word
            # to the first previous_K topics will use the previous fixed
            # topic-word distributions -- as specified by previous_ckn and previous_ck
            self.previous_ckn = self.previous_model.selected_ckn
            self.previous_ck = self.previous_model.selected_ck
            self.previous_vocab = self.previous_model.selected_vocab
            assert(len(self.previous_ck) == self.previous_K)
            assert(self.previous_ckn.shape[0] == len(self.previous_ck))
            assert(self.previous_ckn.shape[1] == len(self.previous_vocab))

            # make previous_ckn have the right number of columns
            N_diff = self.N - len(self.previous_vocab)
            temp = np.zeros((self.previous_K, N_diff), dtype=bag_of_word_dtype)
            self.previous_ckn = np.hstack((self.previous_ckn, temp))   # size is previous_K x N

            # make previous_ckn have the right number of rows
            temp = np.zeros((self.K, self.N), dtype=bag_of_word_dtype)
            self.previous_ckn = np.vstack((self.previous_ckn, temp))   # size is (previous_K+K) x N

            # make previous_ck have the right length
            temp = np.zeros(self.K, dtype=bag_of_word_dtype)
            self.previous_ck = np.hstack((self.previous_ck, temp))     # length is (previous_K+K)

            # total no. of topics = old + new topics
            self.K = self.K + self.previous_K
            print "Total no. of topics = " + str(self.K)

        else:
            raise ValueError("No previous topics have been selected")

    else:

        # for training stage
        self.K = K
        self.previous_ckn = np.zeros((self.K, self.N), dtype=bag_of_word_dtype)
        self.previous_ck = np.zeros(self.K, dtype=bag_of_word_dtype)
        self.previous_K = 0     # no old topics

    # make the current arrays too
    self.ckn = np.zeros((self.K, self.N), dtype=bag_of_word_dtype)
    self.ck = np.zeros(self.K, dtype=bag_of_word_dtype)
    self.cdk = np.zeros((self.D, self.K), np.int32)
    self.cd = np.zeros(self.D, np.int32)

    # make sure to get the same results from running gibbs each time
    if random_state is None:
        self.random_state = RandomState(1234567890)
    else:
        self.random_state = random_state

    # randomly assign words to topics
    self.Z = {}
    for d in range(self.D):
        if d % 10 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        document = self.df.iloc[[d]]
        word_idx = utils.word_indices(document)
        for pos, n in enumerate(word_idx):
            b = self.vocab_type[n]
            k = self.random_state.randint(self.K)
            self.cdk[d, k] += 1
            self.cd[d] += 1
            bag_label = self.bag_labels[b]
            bag_ckn = self.ckn[bag_label]
            bag_ck = self.ck[bag_label]
            bag_ckn[k, n] += 1
            bag_ck[k] += 1
            self.Z[(d, pos)] = k
    print

    # turn word counts in each document into a vector of word occurrences
    self.document_indices = {}
    for d in range(self.D):
        document = self.df.iloc[[d]]
        word_idx = utils.word_indices(document)
        word_locs = []
        for pos, n in enumerate(word_idx):
            word_locs.append((pos, n))
        self.document_indices[d] = word_locs
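# Illustrative usage sketch (not part of the class): how the constructor above is
# typically called. The dataframe, vocabulary and parameter values are placeholder
# assumptions, and run() is the sampling method defined elsewhere in this class
# (its signature is taken from the call in cross_validate_is below).
#
#   vocab = [('word_0', 0), ('word_1', 1), ('word_2', 2)]       # (word, bag index) pairs
#   counts = pd.DataFrame(np.random.poisson(1, size=(5, 3)))    # 5 documents x 3 words
#   lda = CollapseGibbs_3bags_Lda(counts, vocab, K=4, alpha=0.1, beta=0.01)
#   lda.run(n_burn=100, n_samples=200, n_thin=10, use_native=True)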
def cross_validate_is(self, n_folds, n_burn, n_samples, n_thin, is_num_samples, is_iters):
    """
    Runs n-fold cross-validation: trains a new sampler on the training folds and
    estimates the log evidence of the held-out fold by importance sampling.
    """
    shuffled_df = self.df.reindex(np.random.permutation(self.df.index))
    folds = np.array_split(shuffled_df, n_folds)

    margs = []
    test_perplexity = []
    for i in range(len(folds)):

        # split the folds into one testing fold and the remaining training folds
        training_df = None
        testing_df = None
        testing_idx = -1
        for j in range(len(folds)):
            if j == i:
                print "K=" + str(self.K) + " Testing fold=" + str(j)
                testing_df = folds[j]
                testing_idx = j
            else:
                print "K=" + str(self.K) + " Training fold=" + str(j)
                if training_df is None:
                    training_df = folds[j]
                else:
                    training_df = training_df.append(folds[j])

        print "Run training gibbs " + str(training_df.shape)
        training_gibbs = CollapseGibbs_3bags_Lda(training_df, self.vocab, self.K, self.alpha, self.beta)
        training_gibbs.run(n_burn, n_samples, n_thin, use_native=True)

        print "Run testing importance sampling " + str(testing_df.shape)
        topics = training_gibbs.topic_word_

        # use prior alpha
        topic_prior = np.ones((self.K, 1))
        topic_prior = topic_prior / np.sum(topic_prior)
        topic_prior = topic_prior * self.K * self.alpha

        # # use posterior alpha inferred from the last sample of training_gibbs
        # topic_prior = training_gibbs.posterior_alpha[:, None]

        print 'topic_prior = ' + str(topic_prior)
        marg = 0
        n_words = 0
        for d in range(testing_df.shape[0]):
            document = testing_df.iloc[[d]]     # evaluate the held-out documents
            words = utils.word_indices(document)
            doc_marg = ldae_is_variants(words, self.vocab, topics, topic_prior,
                                        num_samples=is_num_samples, variant=3, variant_iters=is_iters)
            print "\td = " + str(d) + " doc_marg=" + str(doc_marg)
            sys.stdout.flush()
            marg += doc_marg
            n_words += len(words)

        perp = np.exp(-(marg / n_words))
        print "Log evidence " + str(testing_idx) + " = " + str(marg)
        print "Test perplexity " + str(testing_idx) + " = " + str(perp)
        print
        margs.append(marg)
        test_perplexity.append(perp)

    margs = np.array(margs)
    mean_marg = np.mean(margs)
    self.mean_margs = np.asscalar(mean_marg)

    test_perplexity = np.array(test_perplexity)
    mean_perplexity = np.mean(test_perplexity)
    self.mean_perplexities = np.asscalar(mean_perplexity)

    print
    print "Cross-validation done!"
    print "K=" + str(self.K) + ",mean_approximate_log_evidence=" + str(self.mean_margs) \
        + ",mean_perplexity=" + str(self.mean_perplexities)
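# Illustrative usage sketch: cross-validated evaluation of the model above. The
# argument values are placeholders, not recommended settings; after the call, the
# mean approximate log evidence and mean test perplexity are stored on the object.
#
#   lda = CollapseGibbs_3bags_Lda(counts, vocab, K=4, alpha=0.1, beta=0.01)
#   lda.cross_validate_is(n_folds=4, n_burn=100, n_samples=200, n_thin=10,
#                         is_num_samples=1000, is_iters=5)
#   print lda.mean_margs, lda.mean_perplexities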