Example #1
    def __init__(self, df, vocab, K, alpha, beta, random_state=None, previous_model=None):
        """
        Initialises the collapsed Gibbs sampling for LDA

        Arguments:
        - df: dataframe of word counts, documents x vocabulary words
        - vocab: list of (word, bag_index) pairs, one per column of df
        - K: no. of topics
        - alpha: symmetric prior on document-topic assignment
        - beta: symmetric prior on word-topic assignment
        - random_state: numpy RandomState for reproducible runs (optional)
        - previous_model: previous LDA run, if any
        """
        
        print "CGS LDA initialising"
        self.df = df.replace(np.nan, 0)
        self.alpha = alpha            

        self.D = df.shape[0]    # total no of docs
        self.N = df.shape[1]    # total no of words
        assert(len(vocab)==self.N)
        
        # the word for each vocabulary entry
        self.vocab = [v[0] for v in vocab]
        
        # the bag index for each word in self.vocab, values can only be in [0, 1, ..., n_bags-1]
        self.vocab_type = [int(v[1]) for v in vocab]
            
        # assume exactly 3 bags; `bags` is assumed to be a module-level list
        # of bag labels, matching the field names of bag_of_word_dtype below
        self.bag_labels = bags
        self.n_bags = len(self.bag_labels)

        if hasattr(beta, "__len__"):
            # beta is an np array, must be the same length as the number of bags
            assert(len(beta)==self.n_bags)
            self.beta = beta
        else:
            # beta is a scalar, convert it into np array
            self.beta = np.array([beta])
            self.beta = np.repeat(self.beta, self.n_bags)

        # set total no of topics
        self.cv = False
        self.previous_model = previous_model
        if self.previous_model is not None:
            
            # if some old topics were fixed
            if hasattr(self.previous_model, 'selected_topics'):
            
                # no. of new topics
                self.K = K
            
                # no. of previously selected topics
                self.previous_K = len(self.previous_model.selected_topics)                
                
                # Get the previous ckn and ck values from the training stage.
                # During gibbs update in this testing stage, assignment of word 
                # to the first previous_K topics will use the previous fixed 
                # topic-word distributions -- as specified by previous_ckn and previous_ck
                self.previous_ckn = self.previous_model.selected_ckn
                self.previous_ck = self.previous_model.selected_ck
                self.previous_vocab = self.previous_model.selected_vocab
                assert(len(self.previous_ck)==self.previous_K)
                assert(self.previous_ckn.shape[0]==len(self.previous_ck))
                assert(self.previous_ckn.shape[1]==len(self.previous_vocab))
                
                # make previous_ckn have the right number of columns
                N_diff = self.N - len(self.previous_vocab)
                temp = np.zeros((self.previous_K, N_diff), dtype=bag_of_word_dtype)
                self.previous_ckn = np.hstack((self.previous_ckn, temp)) # size is previous_K x N
                
                # make previous_ckn have the right number of rows
                temp = np.zeros((self.K, self.N), dtype=bag_of_word_dtype)
                self.previous_ckn = np.vstack((self.previous_ckn, temp)) # size is (previous_K+K) x N

                # make previous_ck have the right length
                temp = np.zeros(self.K, dtype=bag_of_word_dtype)
                self.previous_ck = np.hstack((self.previous_ck, temp)) # length is (previous_K+K)

                # total no. of topics = old + new topics
                self.K = self.K + self.previous_K
                print "Total no. of topics = " + str(self.K)
                                
                
            else:                
                raise ValueError("No previous topics have been selected")
                
        else:

            # for training stage
            self.K = K            
            self.previous_ckn = np.zeros((self.K, self.N), dtype=bag_of_word_dtype)
            self.previous_ck = np.zeros(self.K, dtype=bag_of_word_dtype)        
            self.previous_K = 0 # no old topics

        # make the current count arrays too: ckn[k, n] = per-bag counts of
        # word n in topic k, ck[k] = per-bag totals for topic k, cdk[d, k] =
        # words in document d assigned to topic k, cd[d] = words in document d
        self.ckn = np.zeros((self.K, self.N), dtype=bag_of_word_dtype)
        self.ck = np.zeros(self.K, dtype=bag_of_word_dtype)
        self.cdk = np.zeros((self.D, self.K), np.int32)
        self.cd = np.zeros(self.D, np.int32)

        # fix the seed so that repeated gibbs runs give the same results
        # (RandomState is assumed imported from numpy.random)
        if random_state is None:
            self.random_state = RandomState(1234567890)
        else:
            self.random_state = random_state

        # randomly assign words to topics; Z maps (doc, word position) -> topic
        self.Z = {}
        for d in range(self.D):
            if d%10==0:
                sys.stdout.write('.')
                sys.stdout.flush()
            document = self.df.iloc[[d]]
            word_idx = utils.word_indices(document)
            for pos, n in enumerate(word_idx):
                b = self.vocab_type[n]
                k = self.random_state.randint(self.K)
                self.cdk[d, k] += 1
                self.cd[d] += 1
                bag_label = self.bag_labels[b]
                bag_ckn = self.ckn[bag_label]
                bag_ck = self.ck[bag_label]
                bag_ckn[k, n] += 1
                bag_ck[k] += 1
                self.Z[(d, pos)] = k
        print()

        # turn word counts in the document into a vector of word occurrences
        self.document_indices = {}
        for d in range(self.D):
            document = self.df.iloc[[d]]
            word_idx = utils.word_indices(document)
            self.document_indices[d] = list(enumerate(word_idx))
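
The constructor above leans on a few names defined elsewhere in the module: `bags`, `bag_of_word_dtype` and `utils.word_indices`. Below is a minimal sketch of plausible definitions, assuming the bag labels double as field names of a structured numpy dtype, so that `self.ckn[bag_label]` yields a writable per-bag (K, N) count view. The field names, the int32 width and the word_indices body are assumptions for illustration, not the original module's code:

import numpy as np

# one integer count field per bag; any field names would do, as long
# as they match the labels stored in self.bag_labels
bags = ['bag0', 'bag1', 'bag2']
bag_of_word_dtype = np.dtype([(label, np.int32) for label in bags])

def word_indices(document):
    # expand a 1 x N row of counts into one index per word occurrence,
    # e.g. counts [2, 0, 1] -> [0, 0, 2]
    row = document.values.flatten()
    indices = []
    for n in np.nonzero(row)[0]:
        indices.extend([n] * int(row[n]))
    return indices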
Example #2
    def cross_validate_is(self, n_folds, n_burn, n_samples, n_thin,
                          is_num_samples, is_iters):
        """
        Estimates test perplexity by n-fold cross-validation, using
        importance sampling to approximate the marginal likelihood of
        each held-out document
        """
        shuffled_df = self.df.reindex(np.random.permutation(self.df.index))
        folds = np.array_split(shuffled_df, n_folds)
                
        margs = []
        test_perplexity = []
        for i in range(len(folds)):
            
            training_folds = []
            testing_df = None
            testing_idx = -1
            for j in range(len(folds)):
                if j == i:
                    print("K=" + str(self.K) + " Testing fold=" + str(j))
                    testing_df = folds[j]
                    testing_idx = j
                else:
                    print("K=" + str(self.K) + " Training fold=" + str(j))
                    training_folds.append(folds[j])
            # DataFrame.append was removed in pandas 2.0; concatenate instead
            # (assumes `import pandas as pd` at module level)
            training_df = pd.concat(training_folds)

            print "Run training gibbs " + str(training_df.shape)
            training_gibbs = CollapseGibbs_3bags_Lda(training_df, self.vocab, self.K, self.alpha, self.beta)
            training_gibbs.run(n_burn, n_samples, n_thin, use_native=True)
            
            print "Run testing importance sampling " + str(testing_df.shape)
            topics = training_gibbs.topic_word_

            # use prior alpha
            topic_prior = np.ones((self.K, 1))
            topic_prior = topic_prior / np.sum(topic_prior)            
            topic_prior = topic_prior * self.K * self.alpha
            
#             # use posterior alpha inferred from the last sample of training_gibbs
#             topic_prior = training_gibbs.posterior_alpha[:, None]

            print('topic_prior = ' + str(topic_prior))
            marg = 0         
            n_words = 0
            for d in range(testing_df.shape[0]):
                # score the held-out documents, not rows of the full df
                document = testing_df.iloc[[d]]
                words = utils.word_indices(document)
                doc_marg = ldae_is_variants(words, self.vocab, topics, topic_prior,
                                            num_samples=is_num_samples, variant=3, variant_iters=is_iters)
                print("\td = " + str(d) + " doc_marg=" + str(doc_marg))
                sys.stdout.flush()
                marg += doc_marg
                n_words += len(words)

            perp = np.exp(-(marg / n_words))
            print("Log evidence " + str(testing_idx) + " = " + str(marg))
            print("Test perplexity " + str(testing_idx) + " = " + str(perp))
            print()
            margs.append(marg)
            test_perplexity.append(perp)
            
        margs = np.array(margs)
        mean_marg = np.mean(margs)
        self.mean_margs = float(mean_marg)

        test_perplexity = np.array(test_perplexity)
        mean_perplexity = np.mean(test_perplexity)
        self.mean_perplexities = float(mean_perplexity)
        
        print()
        print("Cross-validation done!")
        print("K=" + str(self.K) + ",mean_approximate_log_evidence=" + str(self.mean_margs)
              + ",mean_perplexity=" + str(self.mean_perplexities))