def load_mini_batch(self):
    """Fetch a random batch of Wikipedia articles and return it as a mini-batch.

    Side effects: saves the downloaded articles under ``self.data_wiki_folder``,
    merges any newly seen terms into the vocabulary and rewrites
    ``self.vocab_file`` (re-pointed to ``<data_wiki_folder>/current_vocab.txt``),
    and increments ``self.mini_batch_no``.

    Returns:
        A ``Corpus`` in ``self.output_format`` (TERM_FREQUENCY or the native
        TERM_SEQUENCE representation).
    """
    (docset, articlenames) = get_random_wikipedia_articles(self.batch_size)
    path_articles = save_articles_per_batch(docset, articlenames, self.data_wiki_folder)
    save_articles(docset, articlenames, self.data_wiki_folder)
    # add new terms to the vocabulary set
    raw_data = PreProcessing(path_articles, remove_rare_word=2, remove_common_word=0.5)
    raw_data.process()
    old_vocab = list()
    # Read the existing vocabulary, stopping at the first blank line — this
    # matches the original readline()/strip() loop. Fix: the file handle is
    # now closed even if an exception is raised mid-read (the original
    # leaked it on the error path).
    with open(self.vocab_file) as f_vocab:
        for raw_line in f_vocab:
            term = raw_line.strip()
            if not term:
                break
            old_vocab.append(term)
    new_vocab = set(raw_data.vocab)
    in_new_but_not_in_old = new_vocab - set(old_vocab)
    # NOTE(review): iterating a set gives arbitrary order, so the ordering of
    # the appended new terms is not deterministic (same as the original code).
    result_vocab = old_vocab + list(in_new_but_not_in_old)
    self.vocab_file = self.data_wiki_folder + '/current_vocab.txt'
    with open(self.vocab_file, 'w') as f_new_vocab:
        for term in result_vocab:
            f_new_vocab.write(term + '\n')
    # create corpus to store mini-batch
    dict_vocab = read_vocab(self.vocab_file)
    corpus = Corpus(DataFormat.TERM_SEQUENCE)
    for doc in raw_data.list_doc:
        # Remap each per-batch token id to its id in the merged vocabulary.
        for i in range(len(doc)):
            doc[i] = dict_vocab[raw_data.vocab[doc[i]]]
        if len(doc) > 0:
            corpus.append_doc(doc, len(doc))
    logging.info("Mini batch no: %s", self.mini_batch_no)
    if self.output_format == DataFormat.TERM_FREQUENCY:
        mini_batch = utilizies.convert_corpus_format(
            corpus, DataFormat.TERM_FREQUENCY)
    else:
        mini_batch = corpus
    self.mini_batch_no += 1
    return mini_batch
def infer_new_docs(self, new_corpus):
    """Infer per-document topic proportions for unseen documents.

    *new_corpus* is converted to term-frequency format before running the
    e-step; only the topic-proportion matrix theta is returned.
    """
    freq_docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
    word_ids = freq_docs.word_ids_tks
    counts = freq_docs.cts_lens
    theta, _index = self.e_step(word_ids, counts)
    return theta
def infer_new_docs(self, new_corpus):
    """Infer per-document topic proportions for unseen documents.

    *new_corpus* is converted to term-sequence format before running the
    e-step; the sufficient statistics (N_phi, N_Z) are discarded and only
    the topic-proportion matrix theta is returned.
    """
    seq_docs = convert_corpus_format(new_corpus, DataFormat.TERM_SEQUENCE)
    word_ids = seq_docs.word_ids_tks
    lengths = seq_docs.cts_lens
    _N_phi, _N_Z, theta = self.e_step(word_ids, lengths)
    return theta
def infer_new_docs(self, new_corpus):
    """Infer per-document topic proportions for unseen documents.

    *new_corpus* is converted to term-sequence format, topic assignments are
    sampled via ``sample_z``, and only the topic-proportion matrix theta is
    returned (the per-token assignments z are discarded).
    """
    seq_docs = convert_corpus_format(new_corpus, DataFormat.TERM_SEQUENCE)
    word_ids = seq_docs.word_ids_tks
    lengths = seq_docs.cts_lens
    theta, _z = self.sample_z(word_ids, lengths)
    return theta
def infer_new_docs(self, new_corpus):
    """Infer per-document topic proportions for unseen documents.

    *new_corpus* is converted to term-frequency format, the variational
    e-step produces the Dirichlet parameters gamma, and each row of gamma
    is normalized to sum to 1 to give the topic-proportion matrix theta.
    """
    freq_docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
    gamma, _sstats = self.e_step(freq_docs.word_ids_tks, freq_docs.cts_lens)
    # Row-normalize gamma to a proper distribution over topics.
    row_totals = gamma.sum(axis=1)
    return gamma / row_totals[:, n.newaxis]