Example #1
    def _initialize(self, corpus, vocab, prior_tree, number_of_topics, alpha_alpha):
        Inferencer._initialize(self, vocab, prior_tree, number_of_topics, alpha_alpha);

        # initialize the documents: keyed by document path, valued by a list of tokenized, non-stopword tokens (duplicates retained).
        self._corpus = corpus;
        self._parsed_corpus = self.parse_data();
        
        # initialize the size of the collection, i.e., total number of documents.
        self._number_of_documents = len(self._parsed_corpus);

        '''
        # initialize a D-by-K matrix gamma, valued at N_d/K
        #self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;

        # initialize a V-by-K matrix beta, valued at 1/V, subject to the sum over every row is 1
        #self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types));
        #self._E_log_eta = compute_dirichlet_expectation(self._eta);
        '''

        # initialize the size of the vocabulary, i.e., the total number of distinct token types.
        # self._number_of_terms = len(self._type_to_index)
        
        # initialize a D-by-K matrix gamma, valued at N_d/K
        # self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha + 1.0 * self._number_of_paths / self._number_of_topics;
        # self._gamma = numpy.tile(self._alpha_alpha + 1.0 * self._number_of_terms / self._number_of_topics, (self._number_of_documents, 1));
        self._gamma = self._alpha_alpha + 2.0 * self._number_of_paths / self._number_of_topics * numpy.random.random((self._number_of_documents, self._number_of_topics));
        
        # initialize the variational Dirichlet parameters over tree edges: a K-by-E matrix _var_beta, where E is the total number of edges
        self._var_beta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_edges));
        # for edge_index in self._index_to_edge:
            # self._var_beta[:, [edge_index]] += numpy.sum(phi_sufficient_statistics[:, self._paths_through_edge[edge_index]], axis=1)[:, numpy.newaxis];
            
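The commented-out initialization above references compute_dirichlet_expectation, which is not included in these snippets. For a Dirichlet-distributed vector with parameter gamma, E[log x_k] = psi(gamma_k) - psi(sum_j gamma_j), where psi is the digamma function. A minimal sketch of such a helper, written here as an assumption rather than the project's actual implementation:

import numpy
import scipy.special

def compute_dirichlet_expectation(dirichlet_parameter):
    # For x ~ Dirichlet(alpha): E[log x_k] = psi(alpha_k) - psi(sum_j alpha_j),
    # applied row-wise when the input is a matrix of Dirichlet parameters.
    if dirichlet_parameter.ndim == 1:
        return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter))
    return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter, axis=1))[:, numpy.newaxis]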
Example #2
    def _initialize(self,
                    corpus,
                    vocab,
                    number_of_topics,
                    alpha_alpha,
                    alpha_beta,
                    alpha_eta=0,
                    alpha_sigma_square=1.0
                    ):
        Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta);

        self._parsed_corpus, self._responses = self.parse_data(corpus);
        
        # define the total number of documents
        self._number_of_documents = len(self._parsed_corpus);
        
        # initialize a D-by-K matrix gamma, valued at N_d/K
        self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        # self._gamma = numpy.random.gamma(100., 1./100, (self._number_of_documents, self._number_of_topics))
        
        # initialize a K-by-V matrix beta, with each row intended to sum to 1
        self._beta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types));
        # self._beta /= numpy.sum(self._beta, 1)[:, numpy.newaxis]
        # self._E_log_eta = compute_dirichlet_expectation(self._beta);
        
        self._eta = numpy.zeros((1, self._number_of_topics)) + alpha_eta
        self._sigma_square = alpha_sigma_square
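_eta and _sigma_square are the regression parameters of a supervised-LDA-style response model, where each document's response y_d is Gaussian with mean eta · z̄_d and variance sigma². A minimal prediction sketch, assuming gamma holds the variational Dirichlet parameters above and using E[theta_d] as a stand-in for the empirical topic frequencies z̄_d (predict_responses is a hypothetical helper, not part of the project):

import numpy

def predict_responses(gamma, eta):
    # Expected topic proportions under q(theta_d) = Dirichlet(gamma[d]):
    # E[theta_d] = gamma[d] / sum(gamma[d]); the predicted response for
    # each document is then the dot product eta . E[theta_d].
    z_bar = gamma / numpy.sum(gamma, axis=1)[:, numpy.newaxis]  # D-by-K
    return numpy.dot(z_bar, eta.ravel())  # length-D vector of predicted y_d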
Example #3
    def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
        Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta);

        #self._corpus = corpus;
        self._parsed_corpus = self.parse_data(corpus);
        
        # define the total number of documents
        self._number_of_documents = len(self._parsed_corpus[0]);
        
        # initialize a D-by-K matrix gamma, valued at N_d/K
        self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;

        # initialize a K-by-V matrix eta, with each row intended to sum to 1
        self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types));
Example #4
    def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
        Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta);

        self._corpus = corpus;
        self._parsed_corpus = self.parse_data();
        
        # define the total number of documents
        self._number_of_documents = len(self._parsed_corpus[0]);
        
        # initialize a D-by-K matrix gamma, valued at N_d/K
        self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;

        # initialize a K-by-V matrix eta, with each row intended to sum to 1
        self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types));
Example #5
File: monte_carlo.py Project: jz3707/PyLDA
 def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
     Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta);
     
     self._parsed_corpus = self.parse_data(corpus);
     
      # define the total number of documents
     self._number_of_documents = len(self._parsed_corpus);
     
      # define the counts over different topics for all documents, first indexed by doc id, then by topic id
     self._n_dk = numpy.zeros((self._number_of_documents, self._number_of_topics));
     # define the counts over words for all topics, first indexed by topic id, then indexed by token id
     self._n_kv = numpy.zeros((self._number_of_topics, self._number_of_types));
     self._n_k = numpy.zeros(self._number_of_topics);
      # define the topic assignment for every word in every document, first indexed by doc id, then by word position
     self._k_dn = {};
     
     self.random_initialize();
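random_initialize is called but not shown in the snippet. For a collapsed Gibbs sampler with the count structures above, initialization typically assigns each token a uniformly random topic and increments the matching counts. A minimal sketch, assuming self._parsed_corpus is a list of token-id lists (the body is a guess at the usual pattern, not the project's code):

    def random_initialize(self):
        # assign every token a random topic and accumulate the count
        # statistics consumed by the collapsed Gibbs sweeps
        for doc_id, word_ids in enumerate(self._parsed_corpus):
            self._k_dn[doc_id] = numpy.zeros(len(word_ids), dtype=int)
            for word_pos, word_id in enumerate(word_ids):
                topic = numpy.random.randint(self._number_of_topics)
                self._k_dn[doc_id][word_pos] = topic
                self._n_dk[doc_id, topic] += 1
                self._n_kv[topic, word_id] += 1
                self._n_k[topic] += 1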
Example #6
    def _initialize(self, corpus_en, voc_en, corpus_cn, voc_cn,
                    number_of_topics_ge, alpha_alpha, alpha_beta, lam):
        Inferencer._initialize(self,
                               voc_en,
                               voc_cn,
                               number_of_topics_ge,
                               alpha_alpha,
                               alpha_beta,
                               lam=lam)
        self._corpus_en = corpus_en
        self._corpus_cn = corpus_cn
        self._trans_en_cn = np.zeros(
            (self._number_of_types_cn, self._number_of_types_en))
        self._trans_cn_en = np.zeros(
            (self._number_of_types_en, self._number_of_types_cn))
        self.parse_data()

        # define the total number of documents
        self._number_of_documents_en = len(self._word_idss_en)
        self._number_of_documents_cn = len(self._word_idss_cn)
        self._number_of_documents = self._number_of_documents_en + self._number_of_documents_cn

        # define the counts over topics for all documents, per language
        self._n_dk_en = np.zeros(
            (self._number_of_documents_en, self._number_of_topics))
        self._n_dk_cn = np.zeros(
            (self._number_of_documents_cn, self._number_of_topics))
        # define the counts over words for all topics, per language
        self._n_kv_en = np.zeros(
            (self._number_of_topics, self._number_of_types_en))
        self._n_kv_cn = np.zeros(
            (self._number_of_topics, self._number_of_types_cn))
        # define the per-topic token counts, per language
        self._n_k_en = np.zeros(self._number_of_topics)
        self._n_k_cn = np.zeros(self._number_of_topics)
        # define the topic assignment for every word in every document, first indexed by doc id, then by word position
        self._k_dn_en = {}
        self._k_dn_cn = {}
        self.psi_en = np.zeros(
            (self._number_of_topics, self._number_of_types_en))
        self.psi_cn = np.zeros(
            (self._number_of_topics, self._number_of_types_cn))
        self.phi_en = np.zeros(
            (self._number_of_topics, self._number_of_types_en))
        self.phi_cn = np.zeros(
            (self._number_of_topics, self._number_of_types_cn))

        self.random_initialize()
Example #7
 def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
     Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta);
     
     self._corpus = corpus;
     self.parse_data();
     
      # define the total number of documents
     self._number_of_documents = len(self._word_idss);
     
      # define the counts over different topics for all documents, first indexed by doc id, then by topic id
     self._n_dk = numpy.zeros((self._number_of_documents, self._number_of_topics));
     # define the counts over words for all topics, first indexed by topic id, then indexed by token id
     self._n_kv = numpy.zeros((self._number_of_topics, self._number_of_types));
     self._n_k = numpy.zeros(self._number_of_topics);
      # define the topic assignment for every word in every document, first indexed by doc id, then by word position
     self._k_dn = {};
     
     self.random_initialize();
Example #8
    def _initialize(self, corpus, vocab, labels, alpha_alpha, alpha_beta):
        Inferencer._initialize(self, vocab, labels, alpha_alpha, alpha_beta)

        self._number_of_topics = len(self._label_to_index)

        self._parsed_corpus, self._parsed_labels = self.parse_data(corpus)
        '''
        # define the total number of documents
        self._number_of_documents = len(self._parsed_corpus);

        # define the counts over different topics for all documents, first indexed by doc id, then by topic id
        self._n_dk = numpy.zeros((self._number_of_documents, self._number_of_topics), dtype=int)
        # define the counts over words for all topics, first indexed by topic id, then indexed by token id
        self._n_kv = numpy.zeros((self._number_of_topics, self._number_of_types), dtype=int)
        self._n_k = numpy.zeros(self._number_of_topics, dtype=int)
        # define the topic assignment for every word in every document, first indexed by doc id, then by word position
        self._k_dn = {};
        '''

        # self._number_of_documents, self._k_dn, self._n_dk, self._n_k, self._n_kv = self.random_initialize();
        self._k_dn, self._n_dk, self._n_k, self._n_kv = self.random_initialize()
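Here random_initialize returns the count structures rather than mutating them inside _initialize. In a labeled-LDA setting, a common variant restricts each document's admissible topics to its observed label set. A minimal sketch of such a returning initializer, assuming self._parsed_corpus is a list of token-id lists and self._parsed_labels[doc_id] lists the topic indices licensed for each document (illustrative only, not the project's implementation):

    def random_initialize(self):
        # draw each token's topic uniformly from the document's own label
        # set, as in labeled LDA, and return the resulting count structures
        number_of_documents = len(self._parsed_corpus)
        k_dn = {}
        n_dk = numpy.zeros((number_of_documents, self._number_of_topics), dtype=int)
        n_k = numpy.zeros(self._number_of_topics, dtype=int)
        n_kv = numpy.zeros((self._number_of_topics, self._number_of_types), dtype=int)
        for doc_id, word_ids in enumerate(self._parsed_corpus):
            admissible = self._parsed_labels[doc_id]
            k_dn[doc_id] = numpy.zeros(len(word_ids), dtype=int)
            for word_pos, word_id in enumerate(word_ids):
                topic = numpy.random.choice(admissible)
                k_dn[doc_id][word_pos] = topic
                n_dk[doc_id, topic] += 1
                n_kv[topic, word_id] += 1
                n_k[topic] += 1
        return k_dn, n_dk, n_k, n_kv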