def _initialize(self, corpus, vocab, prior_tree, number_of_topics, alpha_alpha):
    Inferencer._initialize(self, vocab, prior_tree, number_of_topics, alpha_alpha)

    # initialize the documents, keyed by document path, valued by a list of non-stop tokenized words, with duplication
    self._corpus = corpus
    self._parsed_corpus = self.parse_data()

    # initialize the size of the collection, i.e., the total number of documents
    self._number_of_documents = len(self._parsed_corpus)

    # initialize the size of the vocabulary, i.e., the total number of distinct tokens
    # self._number_of_terms = len(self._type_to_index)

    # previously disabled initializations:
    # self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics
    # self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types))
    # self._E_log_eta = compute_dirichlet_expectation(self._eta)

    # initialize a D-by-K matrix gamma
    # self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha + 1.0 * self._number_of_paths / self._number_of_topics
    # self._gamma = numpy.tile(self._alpha_alpha + 1.0 * self._number_of_terms / self._number_of_topics, (self._number_of_documents, 1))
    self._gamma = self._alpha_alpha + 2.0 * self._number_of_paths / self._number_of_topics * numpy.random.random((self._number_of_documents, self._number_of_topics))

    # initialize the variational parameter over tree edges as a K-by-E matrix, where E is the number of edges in the prior tree
    # (conceptually, one K-by-C block per node, where C is that node's number of children)
    self._var_beta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_edges))
    # for edge_index in self._index_to_edge:
    #     self._var_beta[:, [edge_index]] += numpy.sum(phi_sufficient_statistics[:, self._paths_through_edge[edge_index]], axis=1)[:, numpy.newaxis]
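# The disabled lines above reference compute_dirichlet_expectation. Below is a minimal
# sketch of what such a helper typically computes, assuming it takes a matrix whose rows
# are Dirichlet parameter vectors; this is an illustration, not necessarily the exact
# helper shipped with this package.
import numpy
import scipy.special

def compute_dirichlet_expectation_sketch(dirichlet_parameter):
    # E[log theta_k] = psi(gamma_k) - psi(sum_j gamma_j), computed row-wise
    if dirichlet_parameter.ndim == 1:
        return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter))
    return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter, axis=1))[:, numpy.newaxis]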
def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta, alpha_eta=0, alpha_sigma_square=1.0):
    Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta)

    self._parsed_corpus, self._responses = self.parse_data(corpus)

    # define the total number of documents
    self._number_of_documents = len(self._parsed_corpus)

    # initialize a D-by-K matrix gamma, valued at alpha + V/K
    self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics
    # self._gamma = numpy.random.gamma(100., 1. / 100., (self._number_of_documents, self._number_of_topics))

    # initialize a K-by-V matrix beta; each row will be normalized to sum to 1
    self._beta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types))
    # self._beta /= numpy.sum(self._beta, 1)[:, numpy.newaxis]
    # self._E_log_eta = compute_dirichlet_expectation(self._beta)

    # initialize the regression coefficients eta and the response noise variance sigma^2
    self._eta = numpy.zeros((1, self._number_of_topics)) + alpha_eta
    self._sigma_square = alpha_sigma_square
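# In supervised LDA, _eta and _sigma_square parameterize a Gaussian response model
# y_d ~ N(eta . z_bar_d, sigma^2), where z_bar_d is the empirical mean of the per-word
# topic assignments in document d. A minimal sketch of the predicted response, assuming
# z_bar is a D-by-K matrix of mean topic assignments (the helper name is hypothetical):
import numpy

def predict_response_sketch(eta, z_bar):
    # eta: 1-by-K regression coefficients; z_bar: D-by-K mean topic assignments
    # returns a length-D vector of expected responses
    return numpy.dot(z_bar, eta.ravel())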
def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
    Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta)

    # self._corpus = corpus
    self._parsed_corpus = self.parse_data(corpus)

    # define the total number of documents
    self._number_of_documents = len(self._parsed_corpus[0])

    # initialize a D-by-K matrix gamma, valued at alpha + V/K
    self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics

    # initialize a K-by-V matrix eta; each row will be normalized to sum to 1
    self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types))
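# The gamma and eta matrices above feed the variational E-step, where each word's topic
# responsibilities are set proportional to exp(E[log theta_dk] + E[log beta_kv]). A
# minimal sketch of that per-word update, assuming log_theta_d is the length-K vector
# E[log theta_d] and log_beta_v the length-K vector E[log beta_{.,v}] (names are
# illustrative, not this package's API):
import numpy

def update_phi_sketch(log_theta_d, log_beta_v):
    log_phi = log_theta_d + log_beta_v
    log_phi -= numpy.max(log_phi)      # stabilize before exponentiating
    phi = numpy.exp(log_phi)
    return phi / numpy.sum(phi)        # normalize to a distribution over topics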
def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
    Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta)

    self._corpus = corpus
    self._parsed_corpus = self.parse_data()

    # define the total number of documents
    self._number_of_documents = len(self._parsed_corpus[0])

    # initialize a D-by-K matrix gamma, valued at alpha + V/K
    self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics

    # initialize a K-by-V matrix eta; each row will be normalized to sum to 1
    self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types))
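# For reference, the D-by-K gamma initialized above is the variational Dirichlet
# parameter over per-document topic proportions; in standard variational LDA it is
# refreshed during the E-step as gamma_d = alpha + sum_n phi_dn. A minimal sketch of
# that update for a single document, assuming phi_d is an N_d-by-K matrix of per-word
# topic responsibilities (names here are illustrative):
import numpy

def update_gamma_sketch(alpha_alpha, phi_d):
    # alpha_alpha: length-K Dirichlet prior; phi_d: N_d-by-K word-topic responsibilities
    return alpha_alpha + numpy.sum(phi_d, axis=0)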
def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
    Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta)

    self._parsed_corpus = self.parse_data(corpus)

    # define the total number of documents
    self._number_of_documents = len(self._parsed_corpus)

    # counts over topics for every document, indexed first by doc_id, then by topic_id
    self._n_dk = numpy.zeros((self._number_of_documents, self._number_of_topics))
    # counts over words for every topic, indexed first by topic_id, then by type_id
    self._n_kv = numpy.zeros((self._number_of_topics, self._number_of_types))
    # total word count for every topic
    self._n_k = numpy.zeros(self._number_of_topics)
    # topic assignment for every word in every document, indexed first by doc_id, then by word position
    self._k_dn = {}

    self.random_initialize()
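# random_initialize() is expected to populate the count structures defined above. A
# minimal sketch of such an initialization, assuming self._parsed_corpus is a list of
# documents, each a list of token ids (an assumption; the actual parse format may differ):
import numpy

def random_initialize_sketch(self):
    for doc_id, document in enumerate(self._parsed_corpus):
        self._k_dn[doc_id] = numpy.zeros(len(document), dtype=int)
        for word_pos, type_id in enumerate(document):
            # draw an initial topic uniformly at random and update all counts
            topic_id = numpy.random.randint(self._number_of_topics)
            self._k_dn[doc_id][word_pos] = topic_id
            self._n_dk[doc_id, topic_id] += 1
            self._n_kv[topic_id, type_id] += 1
            self._n_k[topic_id] += 1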
def _initialize(self, corpus_en, voc_en, corpus_cn, voc_cn, number_of_topics_ge, alpha_alpha, alpha_beta, lam):
    Inferencer._initialize(self, voc_en, voc_cn, number_of_topics_ge, alpha_alpha, alpha_beta, lam=lam)

    self._corpus_en = corpus_en
    self._corpus_cn = corpus_cn

    # word translation matrices between the two vocabularies
    self._trans_en_cn = np.zeros((self._number_of_types_cn, self._number_of_types_en))
    self._trans_cn_en = np.zeros((self._number_of_types_en, self._number_of_types_cn))

    self.parse_data()

    # define the total number of documents
    self._number_of_documents_en = len(self._word_idss_en)
    self._number_of_documents_cn = len(self._word_idss_cn)
    self._number_of_documents = self._number_of_documents_en + self._number_of_documents_cn

    # counts over topics for every document and over words for every topic, per language
    self._n_dk_en = np.zeros((self._number_of_documents_en, self._number_of_topics))
    self._n_dk_cn = np.zeros((self._number_of_documents_cn, self._number_of_topics))
    self._n_kv_en = np.zeros((self._number_of_topics, self._number_of_types_en))
    self._n_kv_cn = np.zeros((self._number_of_topics, self._number_of_types_cn))
    self._n_k_en = np.zeros(self._number_of_topics)
    self._n_k_cn = np.zeros(self._number_of_topics)

    # topic assignment for every word in every document, indexed first by doc_id, then by word position
    self._k_dn_en = {}
    self._k_dn_cn = {}

    # per-language topic-word distributions
    self.psi_en = np.zeros((self._number_of_topics, self._number_of_types_en))
    self.psi_cn = np.zeros((self._number_of_topics, self._number_of_types_cn))
    self.phi_en = np.zeros((self._number_of_topics, self._number_of_types_en))
    self.phi_cn = np.zeros((self._number_of_topics, self._number_of_types_cn))

    self.random_initialize()
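# The phi_en / phi_cn matrices above hold per-language topic-word distributions. A
# minimal sketch of how such a distribution is commonly estimated from Gibbs counts,
# using the usual smoothed ratio (illustrative only; the exact estimator used here,
# including how the translation matrices enter, may differ):
import numpy as np

def estimate_phi_sketch(n_kv, n_k, alpha_beta):
    # n_kv: K-by-V topic-word counts; n_k: length-K topic totals; alpha_beta: smoothing prior
    number_of_types = n_kv.shape[1]
    return (n_kv + alpha_beta) / (n_k[:, np.newaxis] + number_of_types * alpha_beta)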
def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
    Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta)

    self._corpus = corpus
    self.parse_data()

    # define the total number of documents
    self._number_of_documents = len(self._word_idss)

    # counts over topics for every document, indexed first by doc_id, then by topic_id
    self._n_dk = numpy.zeros((self._number_of_documents, self._number_of_topics))
    # counts over words for every topic, indexed first by topic_id, then by type_id
    self._n_kv = numpy.zeros((self._number_of_topics, self._number_of_types))
    # total word count for every topic
    self._n_k = numpy.zeros(self._number_of_topics)
    # topic assignment for every word in every document, indexed first by doc_id, then by word position
    self._k_dn = {}

    self.random_initialize()
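# During sampling, each word's topic is redrawn from the collapsed conditional
# p(z = k) proportional to (n_dk + alpha) * (n_kv + beta) / (n_k + V * beta), after the
# word's current assignment has been removed from the counts. A minimal sketch of that
# draw for one word (illustrative; the package's own sampler may organize this differently):
import numpy

def sample_topic_sketch(n_dk_d, n_kv_v, n_k, alpha_alpha, alpha_beta, number_of_types):
    # n_dk_d: length-K counts for this document; n_kv_v: length-K counts for this word type
    weights = (n_dk_d + alpha_alpha) * (n_kv_v + alpha_beta) / (n_k + number_of_types * alpha_beta)
    weights /= numpy.sum(weights)
    return numpy.random.choice(len(weights), p=weights)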
def _initialize(self, corpus, vocab, labels, alpha_alpha, alpha_beta):
    Inferencer._initialize(self, vocab, labels, alpha_alpha, alpha_beta)

    # every label corresponds to one topic
    self._number_of_topics = len(self._label_to_index)

    self._parsed_corpus, self._parsed_labels = self.parse_data(corpus)

    '''
    # define the total number of documents
    self._number_of_documents = len(self._parsed_corpus)
    # counts over topics for every document, indexed first by doc_id, then by topic_id
    self._n_dk = numpy.zeros((self._number_of_documents, self._number_of_topics), dtype=int)
    # counts over words for every topic, indexed first by topic_id, then by type_id
    self._n_kv = numpy.zeros((self._number_of_topics, self._number_of_types), dtype=int)
    self._n_k = numpy.zeros(self._number_of_topics, dtype=int)
    # topic assignment for every word in every document, indexed first by doc_id, then by word position
    self._k_dn = {}
    '''

    # self._number_of_documents, self._k_dn, self._n_dk, self._n_k, self._n_kv = self.random_initialize()
    self._k_dn, self._n_dk, self._n_k, self._n_kv = self.random_initialize()
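# Here random_initialize() returns the assignment and count structures rather than
# mutating attributes. A minimal sketch of a labeled-LDA style initialization, assuming
# each document's admissible topics are its own labels and that _parsed_corpus holds
# lists of token ids while _parsed_labels holds lists of label/topic indices (both are
# assumptions about the parse format):
import numpy
import random

def random_initialize_sketch(self):
    n_dk = numpy.zeros((len(self._parsed_corpus), self._number_of_topics), dtype=int)
    n_kv = numpy.zeros((self._number_of_topics, self._number_of_types), dtype=int)
    n_k = numpy.zeros(self._number_of_topics, dtype=int)
    k_dn = {}
    for doc_id, document in enumerate(self._parsed_corpus):
        k_dn[doc_id] = numpy.zeros(len(document), dtype=int)
        for word_pos, type_id in enumerate(document):
            # restrict the initial topic to the document's observed labels
            topic_id = random.choice(self._parsed_labels[doc_id])
            k_dn[doc_id][word_pos] = topic_id
            n_dk[doc_id, topic_id] += 1
            n_kv[topic_id, type_id] += 1
            n_k[topic_id] += 1
    return k_dn, n_dk, n_k, n_kv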